155 files changed, 5143 insertions, 4960 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index c53a9f91cc0..5844c2480d6 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -41,61 +41,65 @@ elseif(WIN32 AND MSVC)
 		set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2")
 	endif()
 
+	# Unlike GCC/clang we still use fast math, because there is no fine
+	# grained control and the speedup we get here is too big to ignore.
+	set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+
 	# there is no /arch:SSE3, but intrinsics are available anyway
 	if(CMAKE_CL_64)
-		set(CYCLES_SSE2_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_SSE3_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_SSE41_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
 	else()
-		set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-		set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+		set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_SSE41_KERNEL_FLAGS "/arch:SSE2 ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_AVX_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
+		set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_AVX2_ARCH_FLAGS} ${CYCLES_KERNEL_FLAGS}")
 	endif()
 
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}")
 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox")
 	set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox")
 	set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox")
-
-	set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
-elseif(CMAKE_COMPILER_IS_GNUCC)
+elseif(CMAKE_COMPILER_IS_GNUCC OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
 	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
-	set(CYCLES_KERNEL_FLAGS "-ffast-math")
-	if(CXX_HAS_SSE)
-		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
-		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
-		set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse")
-	endif()
-	if(CXX_HAS_AVX)
-		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mfpmath=sse")
-	endif()
-	if(CXX_HAS_AVX2)
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
+
+	# Assume no signal trapping for better code generation.
+	set(CYCLES_KERNEL_FLAGS "-fno-trapping-math")
+	# Avoid overhead of setting errno for NaNs.
+	set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-math-errno")
+	# Let compiler optimize 0.0 - x without worrying about signed zeros.
+	set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signed-zeros")
+
+	if(CMAKE_COMPILER_IS_GNUCC)
+		# Assume no signaling NaNs for better code generation.
+		set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-signaling-nans")
+		# Assume a fixed rounding mode for better constant folding.
+		set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -fno-rounding-math")
 	endif()
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
-elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
-	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
-	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
-	set(CYCLES_KERNEL_FLAGS "-ffast-math")
+
 	if(CXX_HAS_SSE)
-		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
-		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
-		set(CYCLES_SSE41_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1")
-	endif()
-	if(CXX_HAS_AVX)
-		set(CYCLES_AVX_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx")
-	endif()
-	if(CXX_HAS_AVX2)
-		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
+		if(CMAKE_COMPILER_IS_GNUCC)
+			set(CYCLES_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -mfpmath=sse")
+		endif()
+
+		set(CYCLES_SSE2_KERNEL_FLAGS "${CYCLES_KERNEL_FLAGS} -msse -msse2")
+		set(CYCLES_SSE3_KERNEL_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS} -msse3 -mssse3")
+		set(CYCLES_SSE41_KERNEL_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS} -msse4.1")
+		if(CXX_HAS_AVX)
+			set(CYCLES_AVX_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx")
+		endif()
+		if(CXX_HAS_AVX2)
+			set(CYCLES_AVX2_KERNEL_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS} -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
+		endif()
 	endif()
-	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
+
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CYCLES_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_SSE)
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index 82c4ffc6e50..17efb00abdb 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -32,14 +32,11 @@ class AddPresetIntegrator(AddPresetBase, Operator):
 
     preset_values = [
         "cycles.max_bounces",
-        "cycles.min_bounces",
         "cycles.diffuse_bounces",
         "cycles.glossy_bounces",
         "cycles.transmission_bounces",
         "cycles.volume_bounces",
-        "cycles.transparent_min_bounces",
         "cycles.transparent_max_bounces",
-        "cycles.use_transparent_shadows",
         "cycles.caustics_reflective",
         "cycles.caustics_refractive",
         "cycles.blur_glossy"
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 68474529ed3..7b16ef1d543 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -205,13 +205,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 name="AA Samples",
                 description="Number of antialiasing samples to render for each pixel",
                 min=1, max=2097151,
-                default=4,
+                default=128,
                 )
         cls.preview_aa_samples = IntProperty(
                 name="AA Samples",
                 description="Number of antialiasing samples to render in the viewport, unlimited if 0",
                 min=0, max=2097151,
-                default=4,
+                default=32,
                 )
         cls.diffuse_samples = IntProperty(
                 name="Diffuse Samples",
@@ -308,17 +308,9 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 description="Adaptively blur glossy shaders after blurry bounces, "
                             "to reduce noise at the cost of accuracy",
                 min=0.0, max=10.0,
-                default=0.0,
+                default=1.0,
                 )
 
-        cls.min_bounces = IntProperty(
-                name="Min Bounces",
-                description="Minimum number of bounces, setting this lower "
-                            "than the maximum enables probabilistic path "
-                            "termination (faster but noisier)",
-                min=0, max=1024,
-                default=3,
-                )
         cls.max_bounces = IntProperty(
                 name="Max Bounces",
                 description="Total maximum number of bounces",
@@ -351,26 +343,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                 default=0,
                 )
 
-        cls.transparent_min_bounces = IntProperty(
-                name="Transparent Min Bounces",
-                description="Minimum number of transparent bounces, setting "
-                            "this lower than the maximum enables "
-                            "probabilistic path termination (faster but "
-                            "noisier)",
-                min=0, max=1024,
-                default=8,
-                )
         cls.transparent_max_bounces = IntProperty(
                 name="Transparent Max Bounces",
                 description="Maximum number of transparent bounces",
                 min=0, max=1024,
                 default=8,
                 )
-        cls.use_transparent_shadows = BoolProperty(
-                name="Transparent Shadows",
-                description="Use transparency of surfaces for rendering shadows",
-                default=True,
-                )
 
         cls.volume_step_size = FloatProperty(
                 name="Step Size",
@@ -475,7 +453,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
                             "higher values will be scaled down to avoid too "
                             "much noise and slow convergence at the cost of accuracy",
                 min=0.0, max=1e8,
-                default=0.0,
+                default=10.0,
                 )
 
         cls.debug_tile_size = IntProperty(
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 49beebe5ab4..7ab47455c49 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -139,7 +139,7 @@ def draw_samples_info(layout, context):
                       (ao * aa, ml * aa, sss * aa, vol * aa))
 
 
-class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
     bl_label = "Sampling"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -214,7 +214,7 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
         draw_samples_info(layout, context)
 
 
-class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_geometry(CyclesButtonsPanel, Panel):
     bl_label = "Geometry"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -270,7 +270,7 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
         row.prop(ccscene, "maximum_width", text="Max Extension")
 
 
-class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_light_paths(CyclesButtonsPanel, Panel):
     bl_label = "Light Paths"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -292,8 +292,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
         sub = col.column(align=True)
         sub.label("Transparency:")
         sub.prop(cscene, "transparent_max_bounces", text="Max")
-        sub.prop(cscene, "transparent_min_bounces", text="Min")
-        sub.prop(cscene, "use_transparent_shadows", text="Shadows")
 
         col.separator()
 
@@ -306,7 +304,6 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
         sub = col.column(align=True)
         sub.label(text="Bounces:")
         sub.prop(cscene, "max_bounces", text="Max")
-        sub.prop(cscene, "min_bounces", text="Min")
 
         sub = col.column(align=True)
         sub.prop(cscene, "diffuse_bounces", text="Diffuse")
@@ -315,7 +312,7 @@ class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
         sub.prop(cscene, "volume_bounces", text="Volume")
 
 
-class CyclesRender_PT_motion_blur(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_motion_blur(CyclesButtonsPanel, Panel):
     bl_label = "Motion Blur"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -356,7 +353,7 @@ class CyclesRender_PT_motion_blur(CyclesButtonsPanel, Panel):
         row.prop(cscene, "rolling_shutter_duration")
 
 
-class CyclesRender_PT_film(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_film(CyclesButtonsPanel, Panel):
     bl_label = "Film"
 
     def draw(self, context):
@@ -378,7 +375,7 @@ class CyclesRender_PT_film(CyclesButtonsPanel, Panel):
             sub.prop(cscene, "filter_width", text="Width")
 
 
-class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_performance(CyclesButtonsPanel, Panel):
     bl_label = "Performance"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -399,6 +396,8 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
         sub.enabled = rd.threads_mode == 'FIXED'
         sub.prop(rd, "threads")
 
+        col.separator()
+
         sub = col.column(align=True)
         sub.label(text="Tiles:")
         sub.prop(cscene, "tile_order", text="")
@@ -408,19 +407,10 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
 
         sub.prop(cscene, "use_progressive_refine")
 
-        subsub = sub.column(align=True)
-        subsub.prop(rd, "use_save_buffers")
-
-        col = split.column(align=True)
-
-        col.label(text="Viewport:")
-        col.prop(cscene, "debug_bvh_type", text="")
-        col.separator()
-        col.prop(cscene, "preview_start_resolution")
-
-        col.separator()
+        col = split.column()
 
         col.label(text="Final Render:")
+        col.prop(rd, "use_save_buffers")
         col.prop(rd, "use_persistent_data", text="Persistent Images")
 
         col.separator()
@@ -433,8 +423,14 @@ class CyclesRender_PT_performance(CyclesButtonsPanel, Panel):
         row.active = not cscene.debug_use_spatial_splits
         row.prop(cscene, "debug_bvh_time_steps")
 
+        col = layout.column()
+        col.label(text="Viewport Resolution:")
+        split = col.split()
+        split.prop(rd, "preview_pixel_size", text="")
+        split.prop(cscene, "preview_start_resolution")
+
 
-class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_layer_options(CyclesButtonsPanel, Panel):
     bl_label = "Layer"
     bl_context = "render_layer"
 
@@ -470,7 +466,7 @@ class CyclesRender_PT_layer_options(CyclesButtonsPanel, Panel):
         col.prop(rl, "use_strand", "Use Hair")
 
 
-class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_layer_passes(CyclesButtonsPanel, Panel):
     bl_label = "Passes"
     bl_context = "render_layer"
     bl_options = {'DEFAULT_CLOSED'}
@@ -544,7 +540,7 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
             col.prop(crl, "pass_debug_ray_bounces")
 
 
-class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_views(CyclesButtonsPanel, Panel):
     bl_label = "Views"
     bl_context = "render_layer"
     bl_options = {'DEFAULT_CLOSED'}
@@ -587,7 +583,7 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
             row.prop(rv, "camera_suffix", text="")
 
 
-class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
     bl_label = "Denoising"
     bl_context = "render_layer"
     bl_options = {'DEFAULT_CLOSED'}
@@ -652,7 +648,7 @@ class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel):
         sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True)
 
 
-class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
+class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel):
     bl_label = "Post Processing"
     bl_options = {'DEFAULT_CLOSED'}
 
@@ -671,7 +667,7 @@ class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
         col.prop(rd, "dither_intensity", text="Dither", slider=True)
 
 
-class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel):
+class CYCLES_CAMERA_PT_dof(CyclesButtonsPanel, Panel):
     bl_label = "Depth of Field"
     bl_context = "data"
 
@@ -722,7 +718,7 @@ class CyclesCamera_PT_dof(CyclesButtonsPanel, Panel):
         sub.prop(ccam, "aperture_ratio", text="Ratio")
 
 
-class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
+class CYCLES_PT_context_material(CyclesButtonsPanel, Panel):
     bl_label = ""
     bl_context = "material"
     bl_options = {'HIDE_HEADER'}
@@ -782,7 +778,7 @@ class Cycles_PT_context_material(CyclesButtonsPanel, Panel):
             split.separator()
 
 
-class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
+class CYCLES_OBJECT_PT_motion_blur(CyclesButtonsPanel, Panel):
     bl_label = "Motion Blur"
     bl_context = "object"
     bl_options = {'DEFAULT_CLOSED'}
@@ -830,7 +826,7 @@ class CyclesObject_PT_motion_blur(CyclesButtonsPanel, Panel):
         sub.prop(cob, "motion_steps", text="Steps")
 
 
-class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel):
+class CYCLES_OBJECT_PT_cycles_settings(CyclesButtonsPanel, Panel):
     bl_label = "Cycles Settings"
     bl_context = "object"
     bl_options = {'DEFAULT_CLOSED'}
@@ -939,7 +935,7 @@ def panel_node_draw(layout, id_data, output_type, input_name):
     return True
 
 
-class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel):
+class CYCLES_LAMP_PT_preview(CyclesButtonsPanel, Panel):
     bl_label = "Preview"
     bl_context = "data"
     bl_options = {'DEFAULT_CLOSED'}
@@ -955,7 +951,7 @@ class CyclesLamp_PT_preview(CyclesButtonsPanel, Panel):
         self.layout.template_preview(context.lamp)
 
 
-class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel):
+class CYCLES_LAMP_PT_lamp(CyclesButtonsPanel, Panel):
     bl_label = "Lamp"
     bl_context = "data"
 
@@ -1009,7 +1005,7 @@ class CyclesLamp_PT_lamp(CyclesButtonsPanel, Panel):
             layout.label(text="Not supported, interpreted as sun lamp")
 
 
-class CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel):
+class CYCLES_LAMP_PT_nodes(CyclesButtonsPanel, Panel):
     bl_label = "Nodes"
     bl_context = "data"
 
@@ -1027,7 +1023,7 @@ class CyclesLamp_PT_nodes(CyclesButtonsPanel, Panel):
             layout.prop(lamp, "color")
 
 
-class CyclesLamp_PT_spot(CyclesButtonsPanel, Panel):
+class CYCLES_LAMP_PT_spot(CyclesButtonsPanel, Panel):
     bl_label = "Spot Shape"
     bl_context = "data"
 
@@ -1052,7 +1048,7 @@ class CyclesLamp_PT_spot(CyclesButtonsPanel, Panel):
         col.prop(lamp, "show_cone")
 
 
-class CyclesWorld_PT_preview(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_preview(CyclesButtonsPanel, Panel):
     bl_label = "Preview"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1065,7 +1061,7 @@ class CyclesWorld_PT_preview(CyclesButtonsPanel, Panel):
         self.layout.template_preview(context.world)
 
 
-class CyclesWorld_PT_surface(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_surface(CyclesButtonsPanel, Panel):
     bl_label = "Surface"
     bl_context = "world"
 
@@ -1082,7 +1078,7 @@ class CyclesWorld_PT_surface(CyclesButtonsPanel, Panel):
             layout.prop(world, "horizon_color", text="Color")
 
 
-class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel):
     bl_label = "Volume"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1099,7 +1095,7 @@ class CyclesWorld_PT_volume(CyclesButtonsPanel, Panel):
         panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume')
 
 
-class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
     bl_label = "Ambient Occlusion"
     bl_context = "world"
 
@@ -1124,7 +1120,7 @@ class CyclesWorld_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
         row.prop(light, "distance", text="Distance")
 
 
-class CyclesWorld_PT_mist(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_mist(CyclesButtonsPanel, Panel):
     bl_label = "Mist Pass"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1151,7 +1147,7 @@ class CyclesWorld_PT_mist(CyclesButtonsPanel, Panel):
         layout.prop(world.mist_settings, "falloff")
 
 
-class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_ray_visibility(CyclesButtonsPanel, Panel):
     bl_label = "Ray Visibility"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1175,7 +1171,7 @@ class CyclesWorld_PT_ray_visibility(CyclesButtonsPanel, Panel):
         flow.prop(visibility, "scatter")
 
 
-class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
+class CYCLES_WORLD_PT_settings(CyclesButtonsPanel, Panel):
     bl_label = "Settings"
     bl_context = "world"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1216,7 +1212,7 @@ class CyclesWorld_PT_settings(CyclesButtonsPanel, Panel):
         col.prop(cworld, "homogeneous_volume", text="Homogeneous")
 
 
-class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_preview(CyclesButtonsPanel, Panel):
     bl_label = "Preview"
     bl_context = "material"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1229,7 +1225,7 @@ class CyclesMaterial_PT_preview(CyclesButtonsPanel, Panel):
         self.layout.template_preview(context.material)
 
 
-class CyclesMaterial_PT_surface(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_surface(CyclesButtonsPanel, Panel):
     bl_label = "Surface"
     bl_context = "material"
 
@@ -1245,7 +1241,7 @@ class CyclesMaterial_PT_surface(CyclesButtonsPanel, Panel):
             layout.prop(mat, "diffuse_color")
 
 
-class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_volume(CyclesButtonsPanel, Panel):
     bl_label = "Volume"
     bl_context = "material"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1264,7 +1260,7 @@ class CyclesMaterial_PT_volume(CyclesButtonsPanel, Panel):
         panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Volume')
 
 
-class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_displacement(CyclesButtonsPanel, Panel):
     bl_label = "Displacement"
     bl_context = "material"
 
@@ -1280,7 +1276,7 @@ class CyclesMaterial_PT_displacement(CyclesButtonsPanel, Panel):
         panel_node_draw(layout, mat, 'OUTPUT_MATERIAL', 'Displacement')
 
 
-class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
+class CYCLES_MATERIAL_PT_settings(CyclesButtonsPanel, Panel):
     bl_label = "Settings"
     bl_context = "material"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1335,7 +1331,7 @@ class CyclesMaterial_PT_settings(CyclesButtonsPanel, Panel):
         col.prop(mat, "pass_index")
 
 
-class CyclesTexture_PT_context(CyclesButtonsPanel, Panel):
+class CYCLES_TEXTURE_PT_context(CyclesButtonsPanel, Panel):
     bl_label = ""
     bl_context = "texture"
     bl_options = {'HIDE_HEADER'}
@@ -1376,7 +1372,7 @@ class CyclesTexture_PT_context(CyclesButtonsPanel, Panel):
                 split.prop(tex, "type", text="")
 
 
-class CyclesTexture_PT_node(CyclesButtonsPanel, Panel):
+class CYCLES_TEXTURE_PT_node(CyclesButtonsPanel, Panel):
     bl_label = "Node"
     bl_context = "texture"
 
@@ -1393,7 +1389,7 @@ class CyclesTexture_PT_node(CyclesButtonsPanel, Panel):
         layout.template_node_view(ntree, node, None)
 
 
-class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel):
+class CYCLES_TEXTURE_PT_mapping(CyclesButtonsPanel, Panel):
     bl_label = "Mapping"
     bl_context = "texture"
 
@@ -1426,7 +1422,7 @@ class CyclesTexture_PT_mapping(CyclesButtonsPanel, Panel):
         row.prop(mapping, "mapping_z", text="")
 
 
-class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel):
+class CYCLES_TEXTURE_PT_colors(CyclesButtonsPanel, Panel):
     bl_label = "Color"
     bl_context = "texture"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1465,7 +1461,7 @@ class CyclesTexture_PT_colors(CyclesButtonsPanel, Panel):
             layout.template_color_ramp(mapping, "color_ramp", expand=True)
 
 
-class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel):
+class CYCLES_PARTICLE_PT_textures(CyclesButtonsPanel, Panel):
     bl_label = "Textures"
     bl_context = "particle"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1496,7 +1492,7 @@ class CyclesParticle_PT_textures(CyclesButtonsPanel, Panel):
             layout.template_ID(slot, "texture", new="texture.new")
 
 
-class CyclesRender_PT_bake(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel):
     bl_label = "Bake"
     bl_context = "render"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1569,7 +1565,7 @@ class CyclesRender_PT_bake(CyclesButtonsPanel, Panel):
             sub.prop(cbk, "cage_extrusion", text="Ray Distance")
 
 
-class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
     bl_label = "Debug"
     bl_context = "render"
     bl_options = {'DEFAULT_CLOSED'}
@@ -1597,11 +1593,15 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
         col.prop(cscene, "debug_use_qbvh")
         col.prop(cscene, "debug_use_cpu_split_kernel")
 
+        col.separator()
+
         col = layout.column()
         col.label('CUDA Flags:')
         col.prop(cscene, "debug_use_cuda_adaptive_compile")
         col.prop(cscene, "debug_use_cuda_split_kernel")
 
+        col.separator()
+
         col = layout.column()
         col.label('OpenCL Flags:')
         col.prop(cscene, "debug_opencl_kernel_type", text="Kernel")
@@ -1610,8 +1610,13 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
         col.prop(cscene, "debug_use_opencl_debug", text="Debug")
         col.prop(cscene, "debug_opencl_mem_limit")
 
+        col.separator()
+
+        col = layout.column()
+        col.prop(cscene, "debug_bvh_type")
+
 
-class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
+class CYCLES_PARTICLE_PT_curve_settings(CyclesButtonsPanel, Panel):
     bl_label = "Cycles Hair Settings"
     bl_context = "particle"
 
@@ -1642,7 +1647,7 @@ class CyclesParticle_PT_CurveSettings(CyclesButtonsPanel, Panel):
         row.prop(cpsys, "use_closetip", text="Close tip")
 
 
-class CyclesScene_PT_simplify(CyclesButtonsPanel, Panel):
+class CYCLES_SCENE_PT_simplify(CyclesButtonsPanel, Panel):
     bl_label = "Simplify"
     bl_context = "scene"
     COMPAT_ENGINES = {'CYCLES'}
@@ -1797,47 +1802,47 @@ def get_panels():
 classes = (
     CYCLES_MT_sampling_presets,
     CYCLES_MT_integrator_presets,
-    CyclesRender_PT_sampling,
-    CyclesRender_PT_geometry,
-    CyclesRender_PT_light_paths,
-    CyclesRender_PT_motion_blur,
-    CyclesRender_PT_film,
-    CyclesRender_PT_performance,
-    CyclesRender_PT_layer_options,
-    CyclesRender_PT_layer_passes,
-    CyclesRender_PT_views,
-    CyclesRender_PT_denoising,
-    Cycles_PT_post_processing,
-    CyclesCamera_PT_dof,
-    Cycles_PT_context_material,
-    CyclesObject_PT_motion_blur,
-    CyclesObject_PT_cycles_settings,
+    CYCLES_RENDER_PT_sampling,
+    CYCLES_RENDER_PT_geometry,
+    CYCLES_RENDER_PT_light_paths,
+    CYCLES_RENDER_PT_motion_blur,
+    CYCLES_RENDER_PT_film,
+    CYCLES_RENDER_PT_performance,
+    CYCLES_RENDER_PT_layer_options,
+    CYCLES_RENDER_PT_layer_passes,
+    CYCLES_RENDER_PT_views,
+    CYCLES_RENDER_PT_denoising,
+    CYCLES_PT_post_processing,
+    CYCLES_CAMERA_PT_dof,
+    CYCLES_PT_context_material,
+    CYCLES_OBJECT_PT_motion_blur,
+    CYCLES_OBJECT_PT_cycles_settings,
     CYCLES_OT_use_shading_nodes,
-    CyclesLamp_PT_preview,
-    CyclesLamp_PT_lamp,
-    CyclesLamp_PT_nodes,
-    CyclesLamp_PT_spot,
-    CyclesWorld_PT_preview,
-    CyclesWorld_PT_surface,
-    CyclesWorld_PT_volume,
-    CyclesWorld_PT_ambient_occlusion,
-    CyclesWorld_PT_mist,
-    CyclesWorld_PT_ray_visibility,
-    CyclesWorld_PT_settings,
-    CyclesMaterial_PT_preview,
-    CyclesMaterial_PT_surface,
-    CyclesMaterial_PT_volume,
-    CyclesMaterial_PT_displacement,
-    CyclesMaterial_PT_settings,
-    CyclesTexture_PT_context,
-    CyclesTexture_PT_node,
-    CyclesTexture_PT_mapping,
-    CyclesTexture_PT_colors,
-    CyclesParticle_PT_textures,
-    CyclesRender_PT_bake,
-    CyclesRender_PT_debug,
-    CyclesParticle_PT_CurveSettings,
-    CyclesScene_PT_simplify,
+    CYCLES_LAMP_PT_preview,
+    CYCLES_LAMP_PT_lamp,
+    CYCLES_LAMP_PT_nodes,
+    CYCLES_LAMP_PT_spot,
+    CYCLES_WORLD_PT_preview,
+    CYCLES_WORLD_PT_surface,
+    CYCLES_WORLD_PT_volume,
+    CYCLES_WORLD_PT_ambient_occlusion,
+    CYCLES_WORLD_PT_mist,
+    CYCLES_WORLD_PT_ray_visibility,
+    CYCLES_WORLD_PT_settings,
+    CYCLES_MATERIAL_PT_preview,
+    CYCLES_MATERIAL_PT_surface,
+    CYCLES_MATERIAL_PT_volume,
+    CYCLES_MATERIAL_PT_displacement,
+    CYCLES_MATERIAL_PT_settings,
+    CYCLES_TEXTURE_PT_context,
+    CYCLES_TEXTURE_PT_node,
+    CYCLES_TEXTURE_PT_mapping,
+    CYCLES_TEXTURE_PT_colors,
+    CYCLES_PARTICLE_PT_textures,
+    CYCLES_RENDER_PT_bake,
+    CYCLES_RENDER_PT_debug,
+    CYCLES_PARTICLE_PT_curve_settings,
+    CYCLES_SCENE_PT_simplify,
 )
 
 
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index b2a745500a1..efd794461d6 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -302,3 +302,16 @@ def do_versions(self):
             cscene = scene.cycles
             if not cscene.is_property_set("light_sampling_threshold"):
                 cscene.light_sampling_threshold = 0.0
+
+    if bpy.data.version <= (2, 79, 0):
+        for scene in bpy.data.scenes:
+            cscene = scene.cycles
+            # Default changes
+            if not cscene.is_property_set("aa_samples"):
+                cscene.aa_samples = 4
+            if not cscene.is_property_set("preview_aa_samples"):
+                cscene.preview_aa_samples = 4
+            if not cscene.is_property_set("blur_glossy"):
+                cscene.blur_glossy = 0.0
+            if not cscene.is_property_set("sample_clamp_indirect"):
+                cscene.sample_clamp_indirect = 0.0
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index 40d6b25f2b7..b29711d30d3 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -544,7 +544,11 @@ void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render,
 
 	if(tfm != cam->matrix) {
 		VLOG(1) << "Camera " << b_ob.name() << " motion detected.";
-		if(motion_time == -1.0f) {
+		if(motion_time == 0.0f) {
+			/* When motion blur is not centered in frame, cam->matrix gets reset. */
+			cam->matrix = tfm;
+		}
+		else if(motion_time == -1.0f) {
 			cam->motion.pre = tfm;
 			cam->use_motion = true;
 		}
@@ -573,7 +577,10 @@ void BlenderSync::sync_camera_motion(BL::RenderSettings& b_render,
 		float fov = 2.0f * atanf((0.5f * sensor_size) / bcam.lens / aspectratio);
 		if(fov != cam->fov) {
 			VLOG(1) << "Camera " << b_ob.name() << " FOV change detected.";
-			if(motion_time == -1.0f) {
+			if(motion_time == 0.0f) {
+				cam->fov = fov;
+			}
+			else if(motion_time == -1.0f) {
 				cam->fov_pre = fov;
 				cam->use_perspective_motion = true;
 			}
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 3ebe2d8cf34..4091c44d379 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -50,8 +50,7 @@ enum {
  * Two triangles has vertex indices in the original Blender-side face.
  * If face is already a quad tri_b will not be initialized.
  */
-inline void face_split_tri_indices(const int num_verts,
-                                   const int face_flag,
+inline void face_split_tri_indices(const int face_flag,
                                    int tri_a[3],
                                    int tri_b[3])
 {
@@ -59,36 +58,37 @@ inline void face_split_tri_indices(const int num_verts,
 		tri_a[0] = 0;
 		tri_a[1] = 1;
 		tri_a[2] = 3;
-		if(num_verts == 4) {
-			tri_b[0] = 2;
-			tri_b[1] = 3;
-			tri_b[2] = 1;
-		}
+
+		tri_b[0] = 2;
+		tri_b[1] = 3;
+		tri_b[2] = 1;
 	}
-	else /*if(face_flag & FACE_FLAG_DIVIDE_13)*/ {
+	else {
+		/* Quad with FACE_FLAG_DIVIDE_13 or single triangle. */
 		tri_a[0] = 0;
 		tri_a[1] = 1;
 		tri_a[2] = 2;
-		if(num_verts == 4) {
-			tri_b[0] = 0;
-			tri_b[1] = 2;
-			tri_b[2] = 3;
-		}
+
+		tri_b[0] = 0;
+		tri_b[1] = 2;
+		tri_b[2] = 3;
 	}
 }
 
 /* Tangent Space */
 
 struct MikkUserData {
-	MikkUserData(const BL::Mesh& mesh_,
-	             BL::MeshTextureFaceLayer *layer_,
-	             int num_faces_)
-	: mesh(mesh_), layer(layer_), num_faces(num_faces_)
+	MikkUserData(const BL::Mesh& b_mesh,
+	             BL::MeshTextureFaceLayer *layer,
+	             int num_faces)
+	        : b_mesh(b_mesh),
+	          layer(layer),
+	          num_faces(num_faces)
 	{
 		tangent.resize(num_faces*4);
 	}
 
-	BL::Mesh mesh;
+	BL::Mesh b_mesh;
 	BL::MeshTextureFaceLayer *layer;
 	int num_faces;
 	vector<float4> tangent;
@@ -103,7 +103,7 @@ static int mikk_get_num_faces(const SMikkTSpaceContext *context)
 static int mikk_get_num_verts_of_face(const SMikkTSpaceContext *context, const int face_num)
 {
 	MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
-	BL::MeshTessFace f = userdata->mesh.tessfaces[face_num];
+	BL::MeshTessFace f = userdata->b_mesh.tessfaces[face_num];
 	int4 vi = get_int4(f.vertices_raw());
 
 	return (vi[3] == 0)? 3: 4;
@@ -112,9 +112,9 @@ static int mikk_get_num_verts_of_face(const SMikkTSpaceContext *context, const i
 static void mikk_get_position(const SMikkTSpaceContext *context, float P[3], const int face_num, const int vert_num)
 {
 	MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
-	BL::MeshTessFace f = userdata->mesh.tessfaces[face_num];
+	BL::MeshTessFace f = userdata->b_mesh.tessfaces[face_num];
 	int4 vi = get_int4(f.vertices_raw());
-	BL::MeshVertex v = userdata->mesh.vertices[vi[vert_num]];
+	BL::MeshVertex v = userdata->b_mesh.vertices[vi[vert_num]];
 	float3 vP = get_float3(v.co());
 
 	P[0] = vP.x;
@@ -148,9 +148,9 @@ static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, float
 		uv[1] = tfuv.y;
 	}
 	else {
-		int vert_idx = userdata->mesh.tessfaces[face_num].vertices()[vert_num];
+		int vert_idx = userdata->b_mesh.tessfaces[face_num].vertices()[vert_num];
 		float3 orco =
-			get_float3(userdata->mesh.vertices[vert_idx].undeformed_co());
+			get_float3(userdata->b_mesh.vertices[vert_idx].undeformed_co());
 		float2 tmp = map_to_sphere(make_float3(orco[0], orco[1], orco[2]));
 		uv[0] = tmp.x;
 		uv[1] = tmp.y;
@@ -160,12 +160,12 @@ static void mikk_get_texture_coordinate(const SMikkTSpaceContext *context, float
 static void mikk_get_normal(const SMikkTSpaceContext *context, float N[3], const int face_num, const int vert_num)
 {
 	MikkUserData *userdata = (MikkUserData*)context->m_pUserData;
-	BL::MeshTessFace f = userdata->mesh.tessfaces[face_num];
+	BL::MeshTessFace f = userdata->b_mesh.tessfaces[face_num];
 	float3 vN;
 
 	if(f.use_smooth()) {
 		int4 vi = get_int4(f.vertices_raw());
-		BL::MeshVertex v = userdata->mesh.vertices[vi[vert_num]];
+		BL::MeshVertex v = userdata->b_mesh.vertices[vi[vert_num]];
 		vN = get_float3(v.normal());
 	}
 	else {
@@ -250,7 +250,7 @@ static void mikk_compute_tangents(BL::Mesh& b_mesh,
 
 	for(int i = 0; i < nverts.size(); i++) {
 		int tri_a[3], tri_b[3];
-		face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+		face_split_tri_indices(face_flags[i], tri_a, tri_b);
 
 		tangent[0] = float4_to_float3(userdata.tangent[i*4 + tri_a[0]]);
 		tangent[1] = float4_to_float3(userdata.tangent[i*4 + tri_a[1]]);
@@ -376,7 +376,7 @@ static void attr_create_vertex_color(Scene *scene,
 
 			for(l->data.begin(c); c != l->data.end(); ++c, ++i) {
 				int tri_a[3], tri_b[3];
-				face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+				face_split_tri_indices(face_flags[i], tri_a, tri_b);
 
 				uchar4 colors[4];
 				colors[0] = color_float_to_byte(color_srgb_to_scene_linear_v3(get_float3(c->color1())));
@@ -469,7 +469,7 @@ static void attr_create_uv_map(Scene *scene,
 
 				for(l->data.begin(t); t != l->data.end(); ++t, ++i) {
 					int tri_a[3], tri_b[3];
-					face_split_tri_indices(nverts[i], face_flags[i], tri_a, tri_b);
+					face_split_tri_indices(face_flags[i], tri_a, tri_b);
 
 					float3 uvs[4];
 					uvs[0] = get_float3(t->uv1());
@@ -719,6 +719,11 @@ static void create_mesh(Scene *scene,
 	int numngons = 0;
 	bool use_loop_normals = b_mesh.use_auto_smooth() && (mesh->subdivision_type != Mesh::SUBDIVISION_CATMULL_CLARK);
 
+	/* If there are no faces, return early and leave the mesh empty. */
+	if(numfaces == 0) {
+		return;
+	}
+
 	BL::Mesh::vertices_iterator v;
 	BL::Mesh::tessfaces_iterator f;
 	BL::Mesh::polygons_iterator p;
@@ -1079,7 +1084,7 @@ Mesh *BlenderSync::sync_mesh(BL::Object& b_ob,
 			}
 
 			/* free derived mesh */
-			b_data.meshes.remove(b_mesh, false);
+			b_data.meshes.remove(b_mesh, false, true, false);
 		}
 	}
 	mesh->geometry_flags = requested_geometry_flags;
@@ -1299,7 +1304,7 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 		sync_curves(mesh, b_mesh, b_ob, true, time_index);
 
 	/* free derived mesh */
-	b_data.meshes.remove(b_mesh, false);
+	b_data.meshes.remove(b_mesh, false, true, false);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index a930c439370..63138c060fb 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -63,8 +63,25 @@ bool BlenderSync::object_is_mesh(BL::Object& b_ob)
 {
 	BL::ID b_ob_data = b_ob.data();
 
-	return (b_ob_data && (b_ob_data.is_a(&RNA_Mesh) ||
-		b_ob_data.is_a(&RNA_Curve) || b_ob_data.is_a(&RNA_MetaBall)));
+	if(!b_ob_data) {
+		return false;
+	}
+
+	if(b_ob.type() == BL::Object::type_CURVE) {
+		/* Skip exporting curves without faces; the overhead can be
+		 * significant when there are many of them for path animation. */
+		BL::Curve b_curve(b_ob.data());
+
+		return (b_curve.bevel_object() ||
+		        b_curve.extrude() != 0.0f ||
+		        b_curve.bevel_depth() != 0.0f ||
+		        b_ob.modifiers.length());
+	}
+	else {
+		return (b_ob_data.is_a(&RNA_Mesh) ||
+		        b_ob_data.is_a(&RNA_Curve) ||
+		        b_ob_data.is_a(&RNA_MetaBall));
+	}
 }
 
 bool BlenderSync::object_is_light(BL::Object& b_ob)
@@ -268,6 +285,29 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		return NULL;
 	}
 
+	/* Visibility flags for both parent and child. */
+	bool use_holdout = (layer_flag & render_layer.holdout_layer) != 0;
+	uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY;
+
+	if(b_parent.ptr.data != b_ob.ptr.data) {
+		visibility &= object_ray_visibility(b_parent);
+	}
+
+	/* Make holdout objects on excluded layer invisible for non-camera rays. */
+	if(use_holdout && (layer_flag & render_layer.exclude_layer)) {
+		visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA);
+	}
+
+	/* Hide objects not on render layer from camera rays. */
+	if(!(layer_flag & render_layer.layer)) {
+		visibility &= ~PATH_RAY_CAMERA;
+	}
+
+	/* Don't export completely invisible objects. */
+	if(visibility == 0) {
+		return NULL;
+	}
+
 	/* key to lookup object */
 	ObjectKey key(b_parent, persistent_id, b_ob);
 	Object *object;
@@ -308,8 +348,6 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 	if(object_map.sync(&object, b_ob, b_parent, key))
 		object_updated = true;
 	
-	bool use_holdout = (layer_flag & render_layer.holdout_layer) != 0;
-	
 	/* mesh sync */
 	object->mesh = sync_mesh(b_ob, object_updated, hide_tris);
 
@@ -322,22 +360,6 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		object_updated = true;
 	}
 
-	/* visibility flags for both parent and child */
-	uint visibility = object_ray_visibility(b_ob) & PATH_RAY_ALL_VISIBILITY;
-	if(b_parent.ptr.data != b_ob.ptr.data) {
-		visibility &= object_ray_visibility(b_parent);
-	}
-
-	/* make holdout objects on excluded layer invisible for non-camera rays */
-	if(use_holdout && (layer_flag & render_layer.exclude_layer)) {
-		visibility &= ~(PATH_RAY_ALL_VISIBILITY - PATH_RAY_CAMERA);
-	}
-
-	/* hide objects not on render layer from camera rays */
-	if(!(layer_flag & render_layer.layer)) {
-		visibility &= ~PATH_RAY_CAMERA;
-	}
-
 	if(visibility != object->visibility) {
 		object->visibility = visibility;
 		object_updated = true;
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 54973fd1b7f..e268c9a0d35 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -60,6 +60,8 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	/* Backup some settings for comparison. */
 	DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type;
 	DebugFlags::OpenCL::KernelType opencl_kernel_type = flags.opencl.kernel_type;
+	/* Synchronize shared flags. */
+	flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type");
 	/* Synchronize CPU flags. */
 	flags.cpu.avx2 = get_boolean(cscene, "debug_use_cpu_avx2");
 	flags.cpu.avx = get_boolean(cscene, "debug_use_cpu_avx");
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 2b5dd5eadea..12de3da063f 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -129,9 +129,9 @@ void BlenderSession::create_session()
 	scene = new Scene(scene_params, session_params.device);
 
 	/* setup callbacks for builtin image support */
-	scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7);
-	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4);
-	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4);
+	scene->image_manager->builtin_image_info_cb = function_bind(&BlenderSession::builtin_image_info, this, _1, _2, _3, _4, _5, _6, _7, _8);
+	scene->image_manager->builtin_image_pixels_cb = function_bind(&BlenderSession::builtin_image_pixels, this, _1, _2, _3, _4, _5);
+	scene->image_manager->builtin_image_float_pixels_cb = function_bind(&BlenderSession::builtin_image_float_pixels, this, _1, _2, _3, _4, _5);
 
 	/* create session */
 	session = new Session(session_params);
@@ -1013,7 +1013,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
                                         int &width,
                                         int &height,
                                         int &depth,
-                                        int &channels)
+                                        int &channels,
+                                        bool& free_cache)
 {
 	/* empty image */
 	is_float = false;
@@ -1021,6 +1022,7 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
 	height = 1;
 	depth = 0;
 	channels = 0;
+	free_cache = false;
 
 	if(!builtin_data)
 		return;
@@ -1034,6 +1036,7 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
 		/* image data */
 		BL::Image b_image(b_id);
 
+		free_cache = !b_image.has_data();
 		is_float = b_image.is_float();
 		width = b_image.size()[0];
 		height = b_image.size()[1];
@@ -1094,7 +1097,8 @@ void BlenderSession::builtin_image_info(const string &builtin_name,
 bool BlenderSession::builtin_image_pixels(const string &builtin_name,
                                           void *builtin_data,
                                           unsigned char *pixels,
-                                          const size_t pixels_size)
+                                          const size_t pixels_size,
+                                          const bool free_cache)
 {
 	if(!builtin_data) {
 		return false;
@@ -1115,7 +1119,6 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name,
 
 	if(image_pixels && num_pixels * channels == pixels_size) {
 		memcpy(pixels, image_pixels, pixels_size * sizeof(unsigned char));
-		MEM_freeN(image_pixels);
 	}
 	else {
 		if(channels == 1) {
@@ -1134,6 +1137,16 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name,
 			}
 		}
 	}
+
+	if(image_pixels) {
+		MEM_freeN(image_pixels);
+	}
+
+	/* Free image buffers to save memory during render. */
+	if(free_cache) {
+		b_image.buffers_free();
+	}
+
 	/* Premultiply, byte images are always straight for Blender. */
 	unsigned char *cp = pixels;
 	for(size_t i = 0; i < num_pixels; i++, cp += channels) {
@@ -1147,7 +1160,8 @@ bool BlenderSession::builtin_image_pixels(const string &builtin_name,
 bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
                                                 void *builtin_data,
                                                 float *pixels,
-                                                const size_t pixels_size)
+                                                const size_t pixels_size,
+                                                const bool free_cache)
 {
 	if(!builtin_data) {
 		return false;
@@ -1172,7 +1186,6 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
 
 		if(image_pixels && num_pixels * channels == pixels_size) {
 			memcpy(pixels, image_pixels, pixels_size * sizeof(float));
-			MEM_freeN(image_pixels);
 		}
 		else {
 			if(channels == 1) {
@@ -1192,6 +1205,15 @@ bool BlenderSession::builtin_image_float_pixels(const string &builtin_name,
 			}
 		}
 
+		if(image_pixels) {
+			MEM_freeN(image_pixels);
+		}
+
+		/* Free image buffers to save memory during render. */
+		if(free_cache) {
+			b_image.buffers_free();
+		}
+
 		return true;
 	}
 	else if(b_id.is_a(&RNA_Object)) {
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 536808c5b18..cbd2303d282 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -156,15 +156,18 @@ protected:
 	                        int &width,
 	                        int &height,
 	                        int &depth,
-	                        int &channels);
+	                        int &channels,
+	                        bool &free_cache);
 	bool builtin_image_pixels(const string &builtin_name,
 	                          void *builtin_data,
 	                          unsigned char *pixels,
-	                          const size_t pixels_size);
+	                          const size_t pixels_size,
+	                          const bool free_cache);
 	bool builtin_image_float_pixels(const string &builtin_name,
 	                                void *builtin_data,
 	                                float *pixels,
-	                                const size_t pixels_size);
+	                                const size_t pixels_size,
+	                                const bool free_cache);
 
 	/* Update tile manager to reflect resumable render settings. */
 	void update_resumable_tile_manager(int num_samples);
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 3a00384458a..42e3721883f 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -234,7 +234,6 @@ void BlenderSync::sync_integrator()
 	Integrator *integrator = scene->integrator;
 	Integrator previntegrator = *integrator;
 
-	integrator->min_bounce = get_int(cscene, "min_bounces");
 	integrator->max_bounce = get_int(cscene, "max_bounces");
 
 	integrator->max_diffuse_bounce = get_int(cscene, "diffuse_bounces");
@@ -243,8 +242,6 @@ void BlenderSync::sync_integrator()
 	integrator->max_volume_bounce = get_int(cscene, "volume_bounces");
 
 	integrator->transparent_max_bounce = get_int(cscene, "transparent_max_bounces");
-	integrator->transparent_min_bounce = get_int(cscene, "transparent_min_bounces");
-	integrator->transparent_shadows = get_boolean(cscene, "use_transparent_shadows");
 
 	integrator->volume_max_steps = get_int(cscene, "volume_max_steps");
 	integrator->volume_step_size = get_float(cscene, "volume_step_size");
@@ -629,14 +626,10 @@ SceneParams BlenderSync::get_scene_params(BL::Scene& b_scene,
 	else if(shadingsystem == 1)
 		params.shadingsystem = SHADINGSYSTEM_OSL;
 	
-	if(background)
+	if(background || DebugFlags().viewport_static_bvh)
 		params.bvh_type = SceneParams::BVH_STATIC;
 	else
-		params.bvh_type = (SceneParams::BVHType)get_enum(
-		        cscene,
-		        "debug_bvh_type",
-		        SceneParams::BVH_NUM_TYPES,
-		        SceneParams::BVH_STATIC);
+		params.bvh_type = SceneParams::BVH_DYNAMIC;
 
 	params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
 	params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
@@ -810,6 +803,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 	}
 
 	params.start_resolution = get_int(cscene, "preview_start_resolution");
+	params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
 
 	/* other parameters */
 	if(b_scene.render().threads_mode() == BL::RenderSettings::threads_mode_FIXED)
@@ -830,6 +824,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 			params.progressive = false;
 
 		params.start_resolution = INT_MAX;
+		params.pixel_size = 1;
 	}
 	else
 		params.progressive = true;
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index ebbf325f95b..363e19f7a20 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -51,8 +51,8 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
                                       bool calc_undeformed,
                                       Mesh::SubdivisionType subdivision_type)
 {
-	bool subsurf_mod_show_render;
-	bool subsurf_mod_show_viewport;
+	bool subsurf_mod_show_render = false;
+	bool subsurf_mod_show_viewport = false;
 
 	if(subdivision_type != Mesh::SUBDIVISION_NONE) {
 		BL::Modifier subsurf_mod = object.modifiers[object.modifiers.length()-1];
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 33143e2d8aa..0ad3c8a7429 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -153,7 +153,6 @@ void BVH::pack_primitives()
 		if(pack.prim_index[i] != -1) {
 			int tob = pack.prim_object[i];
 			Object *ob = objects[tob];
-
 			if((pack.prim_type[i] & PRIMITIVE_ALL_TRIANGLE) != 0) {
 				pack_triangle(i, (float4*)&pack.prim_tri_verts[3 * prim_triangle_index]);
 				pack.prim_tri_index[i] = 3 * prim_triangle_index;
@@ -162,11 +161,10 @@ void BVH::pack_primitives()
 			else {
 				pack.prim_tri_index[i] = -1;
 			}
-
-			pack.prim_visibility[i] = ob->visibility;
-
-			if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE)
+			pack.prim_visibility[i] = ob->visibility_for_tracing();
+			if(pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
 				pack.prim_visibility[i] |= PATH_RAY_CURVE;
+			}
 		}
 		else {
 			pack.prim_tri_index[i] = -1;
diff --git a/intern/cycles/bvh/bvh2.cpp b/intern/cycles/bvh/bvh2.cpp
index 340ba7dcf53..9aa8e71dfd0 100644
--- a/intern/cycles/bvh/bvh2.cpp
+++ b/intern/cycles/bvh/bvh2.cpp
@@ -312,10 +312,8 @@ void BVH2::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 					}
 				}
 			}
-
-			visibility |= ob->visibility;
+			visibility |= ob->visibility_for_tracing();
 		}
-
 		/* TODO(sergey): De-duplicate with pack_leaf(). */
 		float4 leaf_data[BVH_NODE_LEAF_SIZE];
 		leaf_data[0].x = __int_as_float(c0);
diff --git a/intern/cycles/bvh/bvh4.cpp b/intern/cycles/bvh/bvh4.cpp
index 5034ab811d5..777de20423b 100644
--- a/intern/cycles/bvh/bvh4.cpp
+++ b/intern/cycles/bvh/bvh4.cpp
@@ -242,21 +242,21 @@ void BVH4::pack_unaligned_node(int idx,
 		 * so kernel might safely assume there are always 4 child nodes.
 		 */
 
-		data[1][i] = 1.0f;
-		data[2][i] = 0.0f;
-		data[3][i] = 0.0f;
+		data[1][i] = NAN;
+		data[2][i] = NAN;
+		data[3][i] = NAN;
 
-		data[4][i] = 0.0f;
-		data[5][i] = 0.0f;
-		data[6][i] = 0.0f;
+		data[4][i] = NAN;
+		data[5][i] = NAN;
+		data[6][i] = NAN;
 
-		data[7][i] = 0.0f;
-		data[8][i] = 0.0f;
-		data[9][i] = 0.0f;
+		data[7][i] = NAN;
+		data[8][i] = NAN;
+		data[9][i] = NAN;
 
-		data[10][i] = -FLT_MAX;
-		data[11][i] = -FLT_MAX;
-		data[12][i] = -FLT_MAX;
+		data[10][i] = NAN;
+		data[11][i] = NAN;
+		data[12][i] = NAN;
 
 		data[13][i] = __int_as_float(0);
 	}
@@ -438,10 +438,8 @@ void BVH4::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 					}
 				}
 			}
-
-			visibility |= ob->visibility;
+			visibility |= ob->visibility_for_tracing();
 		}
-
 		/* TODO(sergey): This is actually a copy of pack_leaf(),
 		 * but this chunk of code only knows actual data and has
 		 * no idea about BVHNode.
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 1880964355c..649ce52da05 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -529,7 +529,9 @@ BVHNode* BVHBuild::run()
 			        << "  Allocation slop factor: "
 			               << ((prim_type.capacity() != 0)
 			                       ? (float)prim_type.size() / prim_type.capacity()
-			                       : 1.0f) << "\n";
+			                       : 1.0f) << "\n"
+			        << "  Maximum depth: "
+			        << string_human_readable_number(rootnode->getSubtreeSize(BVH_STAT_DEPTH))  << "\n";
 		}
 	}
 
@@ -671,7 +673,7 @@ BVHNode* BVHBuild::build_node(const BVHObjectBinning& range, int level)
 				return create_leaf_node(range, references);
 			}
 		}
-		/* Check whether unaligned split is better than the regulat one. */
+		/* Check whether unaligned split is better than the regular one. */
 		if(unalignedSplitSAH < splitSAH) {
 			do_unalinged_split = true;
 		}
@@ -865,7 +867,7 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 			prim_time[start] = make_float2(ref->time_from(), ref->time_to());
 		}
 
-		uint visibility = objects[ref->prim_object()]->visibility;
+		const uint visibility = objects[ref->prim_object()]->visibility_for_tracing();
 		BVHNode *leaf_node =  new LeafNode(ref->bounds(), visibility, start, start+1);
 		leaf_node->time_from = ref->time_from();
 		leaf_node->time_to = ref->time_to();
@@ -939,7 +941,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			                                         ref.time_to()));
 
 			bounds[type_index].grow(ref.bounds());
-			visibility[type_index] |= objects[ref.prim_object()]->visibility;
+			visibility[type_index] |= objects[ref.prim_object()]->visibility_for_tracing();
 			if(ref.prim_type() & PRIMITIVE_ALL_CURVE) {
 				visibility[type_index] |= PATH_RAY_CURVE;
 			}
@@ -1040,7 +1042,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 		 */
 		start_index = spatial_free_index;
 		spatial_free_index += range.size();
-
 		/* Extend an array when needed. */
 		const size_t range_end = start_index + range.size();
 		if(prim_type.size() < range_end) {
@@ -1066,8 +1067,6 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				prim_time.resize(range_end);
 			}
 		}
-		spatial_spin_lock.unlock();
-
 		/* Perform actual data copy. */
 		if(new_leaf_data_size > 0) {
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
@@ -1077,6 +1076,7 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
 			}
 		}
+		spatial_spin_lock.unlock();
 	}
 	else {
 		/* For the regular BVH builder we simply copy new data starting at the
diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp
index 4237c62ab5b..ab6df4d265d 100644
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -132,6 +132,17 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 		case BVH_STAT_UNALIGNED_LEAF_COUNT:
 			cnt = (is_leaf() && is_unaligned) ? 1 : 0;
 			break;
+		case BVH_STAT_DEPTH:
+			if(is_leaf()) {
+				cnt = 1;
+			}
+			else {
+				for(int i = 0; i < num_children(); i++) {
+					cnt = max(cnt, get_child(i)->getSubtreeSize(stat));
+				}
+				cnt += 1;
+			}
+			return cnt;
 		default:
 			assert(0); /* unknown mode */
 	}
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index 1c875f5a524..94cf5ab730c 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -38,6 +38,7 @@ enum BVH_STAT {
 	BVH_STAT_UNALIGNED_INNER_QNODE_COUNT,
 	BVH_STAT_ALIGNED_LEAF_COUNT,
 	BVH_STAT_UNALIGNED_LEAF_COUNT,
+	BVH_STAT_DEPTH,
 };
 
 class BVHParams;
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 74ec57ddf74..3c632160fbd 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -34,11 +34,13 @@ set(SRC
 
 set(SRC_OPENCL
 	opencl/opencl.h
+	opencl/memory_manager.h
 
 	opencl/opencl_base.cpp
 	opencl/opencl_mega.cpp
 	opencl/opencl_split.cpp
 	opencl/opencl_util.cpp
+	opencl/memory_manager.cpp
 )
 
 if(WITH_CYCLES_NETWORK)
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index a54bb77f9f3..f64436aec7b 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -379,11 +379,9 @@ DeviceInfo Device::get_multi_device(vector<DeviceInfo> subdevices)
 	info.num = 0;
 
 	info.has_bindless_textures = true;
-	info.pack_images = false;
 	foreach(DeviceInfo &device, subdevices) {
 		assert(device.type == info.multi_devices[0].type);
 
-		info.pack_images |= device.pack_images;
 		info.has_bindless_textures &= device.has_bindless_textures;
 	}
 
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index b3b693c630c..26d6d380a10 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -53,7 +53,6 @@ public:
 	int num;
 	bool display_device;
 	bool advanced_shading;
-	bool pack_images;
 	bool has_bindless_textures; /* flag for GPU and Multi device */
 	bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
 	vector<DeviceInfo> multi_devices;
@@ -65,7 +64,6 @@ public:
 		num = 0;
 		display_device = false;
 		advanced_shading = true;
-		pack_images = false;
 		has_bindless_textures = false;
 		use_split_kernel = false;
 	}
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 18112437b45..6a1106328fb 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -48,6 +48,7 @@
 #include "util/util_logging.h"
 #include "util/util_map.h"
 #include "util/util_opengl.h"
+#include "util/util_optimization.h"
 #include "util/util_progress.h"
 #include "util/util_system.h"
 #include "util/util_thread.h"
@@ -119,7 +120,7 @@ public:
 		}
 #endif
 
-		if(strstr(architecture_name, logged_architecture) != 0) {
+		if(strcmp(architecture_name, logged_architecture) != 0) {
 			VLOG(1) << "Will be using " << architecture_name << " kernels.";
 			logged_architecture = architecture_name;
 		}
@@ -976,7 +977,6 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	info.id = "CPU";
 	info.num = 0;
 	info.advanced_shading = true;
-	info.pack_images = false;
 
 	devices.insert(devices.begin(), info);
 }
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 3a29538aa13..29b5bd70789 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -111,6 +111,16 @@ public:
 	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
 };
 
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+public:
+	CUDAContextScope(CUDADevice *device);
+	~CUDAContextScope();
+
+private:
+	CUDADevice *device;
+};
+
 class CUDADevice : public Device
 {
 public:
@@ -206,16 +216,6 @@ public:
 		cuda_error_documentation();
 	}
 
-	void cuda_push_context()
-	{
-		cuda_assert(cuCtxSetCurrent(cuContext));
-	}
-
-	void cuda_pop_context()
-	{
-		cuda_assert(cuCtxSetCurrent(NULL));
-	}
-
 	CUDADevice(DeviceInfo& info, Stats &stats, bool background_)
 	: Device(info, stats, background_)
 	{
@@ -263,7 +263,8 @@ public:
 		cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
 		cuDevArchitecture = major*100 + minor*10;
 
-		cuda_pop_context();
+		/* Pop context set by cuCtxCreate. */
+		cuCtxPopCurrent(NULL);
 	}
 
 	~CUDADevice()
@@ -519,7 +520,7 @@ public:
 			return false;
 
 		/* open module */
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		string cubin_data;
 		CUresult result;
@@ -540,8 +541,6 @@ public:
 		if(cuda_error_(result, "cuModuleLoad"))
 			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
 
-		cuda_pop_context();
-
 		return (result == CUDA_SUCCESS);
 	}
 
@@ -556,36 +555,36 @@ public:
 
 	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
+		CUDAContextScope scope(this);
+
 		if(name) {
 			VLOG(1) << "Buffer allocate: " << name << ", "
 			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 			        << string_human_readable_size(mem.memory_size()) << ")";
 		}
 
-		cuda_push_context();
 		CUdeviceptr device_pointer;
 		size_t size = mem.memory_size();
 		cuda_assert(cuMemAlloc(&device_pointer, size));
 		mem.device_pointer = (device_ptr)device_pointer;
 		mem.device_size = size;
 		stats.mem_alloc(size);
-		cuda_pop_context();
 	}
 
 	void mem_copy_to(device_memory& mem)
 	{
-		cuda_push_context();
+		CUDAContextScope scope(this);
+
 		if(mem.device_pointer)
 			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()));
-		cuda_pop_context();
 	}
 
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
 	{
+		CUDAContextScope scope(this);
 		size_t offset = elem*y*w;
 		size_t size = elem*w*h;
 
-		cuda_push_context();
 		if(mem.device_pointer) {
 			cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
 			                         (CUdeviceptr)(mem.device_pointer + offset), size));
@@ -593,7 +592,6 @@ public:
 		else {
 			memset((char*)mem.data_pointer + offset, 0, size);
 		}
-		cuda_pop_context();
 	}
 
 	void mem_zero(device_memory& mem)
@@ -602,18 +600,17 @@ public:
 			memset((void*)mem.data_pointer, 0, mem.memory_size());
 		}
 
-		cuda_push_context();
-		if(mem.device_pointer)
+		if(mem.device_pointer) {
+			CUDAContextScope scope(this);
 			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
-		cuda_pop_context();
+		}
 	}
 
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
-			cuda_push_context();
+			CUDAContextScope scope(this);
 			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
-			cuda_pop_context();
 
 			mem.device_pointer = 0;
 
@@ -629,14 +626,13 @@ public:
 
 	void const_copy_to(const char *name, void *host, size_t size)
 	{
+		CUDAContextScope scope(this);
 		CUdeviceptr mem;
 		size_t bytes;
 
-		cuda_push_context();
 		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
 		//assert(bytes == size);
 		cuda_assert(cuMemcpyHtoD(mem, host, size));
-		cuda_pop_context();
 	}
 
 	void tex_alloc(const char *name,
@@ -644,6 +640,8 @@ public:
 	               InterpolationType interpolation,
 	               ExtensionType extension)
 	{
+		CUDAContextScope scope(this);
+
 		VLOG(1) << "Texture allocate: " << name << ", "
 		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 		        << string_human_readable_size(mem.memory_size()) << ")";
@@ -706,9 +704,7 @@ public:
 				                          tokens[3].c_str());
 			}
 
-			cuda_push_context();
 			cuda_assert(cuModuleGetTexRef(&texref, cuModule, bind_name.c_str()));
-			cuda_pop_context();
 
 			if(!texref) {
 				return;
@@ -721,8 +717,6 @@ public:
 				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);
 
-				cuda_push_context();
-
 				CUdeviceptr cumem;
 				size_t cubytes;
 
@@ -738,28 +732,20 @@ public:
 					uint32_t ptr = (uint32_t)mem.device_pointer;
 					cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
 				}
-
-				cuda_pop_context();
 			}
 			else {
 				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);
 
-				cuda_push_context();
-
 				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size));
 				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT));
 				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER));
-
-				cuda_pop_context();
 			}
 		}
 		/* Texture Storage */
 		else {
 			CUarray handle = NULL;
 
-			cuda_push_context();
-
 			if(mem.data_depth > 1) {
 				CUDA_ARRAY3D_DESCRIPTOR desc;
 
@@ -784,7 +770,6 @@ public:
 			}
 
 			if(!handle) {
-				cuda_pop_context();
 				return;
 			}
 
@@ -877,14 +862,10 @@ public:
 				cuda_assert(cuTexRefSetFilterMode(texref, filter_mode));
 				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES));
 			}
-
-			cuda_pop_context();
 		}
 
 		/* Fermi, Data and Image Textures */
 		if(!has_bindless_textures) {
-			cuda_push_context();
-
 			cuda_assert(cuTexRefSetAddressMode(texref, 0, address_mode));
 			cuda_assert(cuTexRefSetAddressMode(texref, 1, address_mode));
 			if(mem.data_depth > 1) {
@@ -892,8 +873,6 @@ public:
 			}
 
 			cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements));
-
-			cuda_pop_context();
 		}
 
 		/* Fermi and Kepler */
@@ -904,9 +883,8 @@ public:
 	{
 		if(mem.device_pointer) {
 			if(tex_interp_map[mem.device_pointer]) {
-				cuda_push_context();
+				CUDAContextScope scope(this);
 				cuArrayDestroy((CUarray)mem.device_pointer);
-				cuda_pop_context();
 
 				/* Free CUtexObject (Bindless Textures) */
 				if(info.has_bindless_textures && tex_bindless_map[mem.device_pointer]) {
@@ -960,7 +938,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		int4 rect = task->rect;
 		int w = align_up(rect.z-rect.x, 4);
@@ -1017,7 +995,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1026,7 +1003,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterConstructTransform;
 		cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
@@ -1046,7 +1023,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1058,11 +1034,11 @@ public:
 		if(have_error())
 			return false;
 
+		CUDAContextScope scope(this);
+
 		mem_zero(task->storage.XtWX);
 		mem_zero(task->storage.XtWY);
 
-		cuda_push_context();
-
 		CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
 		cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
 		cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
@@ -1150,7 +1126,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1161,7 +1136,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterCombineHalves;
 		cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
@@ -1179,7 +1154,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1190,7 +1164,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterDivideShadow;
 		cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
@@ -1214,7 +1188,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1227,7 +1200,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterGetFeature;
 		cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
@@ -1250,7 +1223,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1263,7 +1235,7 @@ public:
 		if(have_error())
 			return false;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilterDetectOutliers;
 		cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
@@ -1282,7 +1254,6 @@ public:
 		CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
 		cuda_assert(cuCtxSynchronize());
 
-		cuda_pop_context();
 		return !have_error();
 	}
 
@@ -1319,7 +1290,7 @@ public:
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuPathTrace;
 		CUdeviceptr d_buffer = cuda_device_ptr(rtile.buffer);
@@ -1333,8 +1304,9 @@ public:
 			cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
 		}
 
-		if(have_error())
+		if(have_error()) {
 			return;
+		}
 
 		/* pass in parameters */
 		void *args[] = {&d_buffer,
@@ -1370,8 +1342,6 @@ public:
 		                           0, 0, args, 0));
 
 		cuda_assert(cuCtxSynchronize());
-
-		cuda_pop_context();
 	}
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
@@ -1379,7 +1349,7 @@ public:
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuFilmConvert;
 		CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
@@ -1424,8 +1394,6 @@ public:
 		                           0, 0, args, 0));
 
 		unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
-
-		cuda_pop_context();
 	}
 
 	void shader(DeviceTask& task)
@@ -1433,7 +1401,7 @@ public:
 		if(have_error())
 			return;
 
-		cuda_push_context();
+		CUDAContextScope scope(this);
 
 		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
@@ -1498,8 +1466,6 @@ public:
 
 			task.update_progress(NULL);
 		}
-
-		cuda_pop_context();
 	}
 
 	CUdeviceptr map_pixels(device_ptr mem)
@@ -1535,7 +1501,7 @@ public:
 			pmem.w = mem.data_width;
 			pmem.h = mem.data_height;
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			glGenBuffers(1, &pmem.cuPBO);
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
@@ -1559,8 +1525,6 @@ public:
 			CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
 
 			if(result == CUDA_SUCCESS) {
-				cuda_pop_context();
-
 				mem.device_pointer = pmem.cuTexId;
 				pixel_mem_map[mem.device_pointer] = pmem;
 
@@ -1574,8 +1538,6 @@ public:
 				glDeleteBuffers(1, &pmem.cuPBO);
 				glDeleteTextures(1, &pmem.cuTexId);
 
-				cuda_pop_context();
-
 				background = true;
 			}
 		}
@@ -1588,7 +1550,7 @@ public:
 		if(!background) {
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
 			uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
@@ -1597,8 +1559,6 @@ public:
 			glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
 			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 
-			cuda_pop_context();
-
 			return;
 		}
 
@@ -1611,14 +1571,12 @@ public:
 			if(!background) {
 				PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
-				cuda_push_context();
+				CUDAContextScope scope(this);
 
 				cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
 				glDeleteBuffers(1, &pmem.cuPBO);
 				glDeleteTextures(1, &pmem.cuTexId);
 
-				cuda_pop_context();
-
 				pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
 				mem.device_pointer = 0;
 
@@ -1639,7 +1597,7 @@ public:
 			PixelMem pmem = pixel_mem_map[mem.device_pointer];
 			float *vpointer;
 
-			cuda_push_context();
+			CUDAContextScope scope(this);
 
 			/* for multi devices, this assumes the inefficient method that we allocate
 			 * all pixels on the device even though we only render to a subset */
@@ -1728,8 +1686,6 @@ public:
 			glBindTexture(GL_TEXTURE_2D, 0);
 			glDisable(GL_TEXTURE_2D);
 
-			cuda_pop_context();
-
 			return;
 		}
 
@@ -1738,6 +1694,8 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
+		CUDAContextScope scope(this);
+
 		if(task->type == DeviceTask::RENDER) {
 			RenderTile tile;
 
@@ -1805,9 +1763,7 @@ public:
 
 			shader(*task);
 
-			cuda_push_context();
 			cuda_assert(cuCtxSynchronize());
-			cuda_pop_context();
 		}
 	}
 
@@ -1828,12 +1784,11 @@ public:
 	void task_add(DeviceTask& task)
 	{
 		if(task.type == DeviceTask::FILM_CONVERT) {
+			CUDAContextScope scope(this);
+
 			/* must be done in main thread due to opengl access */
 			film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
-
-			cuda_push_context();
 			cuda_assert(cuCtxSynchronize());
-			cuda_pop_context();
 		}
 		else {
 			task_pool.push(new CUDADeviceTask(this, task));
@@ -1852,6 +1807,7 @@ public:
 
 	friend class CUDASplitKernelFunction;
 	friend class CUDASplitKernel;
+	friend class CUDAContextScope;
 };
 
 /* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
@@ -1872,6 +1828,20 @@ public:
 		} \
 	} (void)0
 
+
+/* CUDA context scope: the constructor pushes the device's CUDA context and
+ * the destructor pops the current one, so every exit from a scope pops it. */
+CUDAContextScope::CUDAContextScope(CUDADevice *device)
+: device(device)
+{
+	cuda_assert(cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+	cuda_assert(cuCtxPopCurrent(NULL));
+}
+
 /* split kernel */
 
 class CUDASplitKernelFunction : public SplitKernelFunction{
@@ -1889,30 +1859,24 @@ public:
 	/* enqueue the kernel, returns false if there is an error */
 	bool enqueue(const KernelDimensions &dim, void *args[])
 	{
-		device->cuda_push_context();
-
 		if(device->have_error())
 			return false;
 
+		CUDAContextScope scope(device);
+
 		/* we ignore dim.local_size for now, as this is faster */
 		int threads_per_block;
 		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
 
-		int xthreads = (int)sqrt(threads_per_block);
-		int ythreads = (int)sqrt(threads_per_block);
-
-		int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
-		int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+		int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
 
 		cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
 
 		cuda_assert(cuLaunchKernel(func,
-		                           xblocks , yblocks, 1, /* blocks */
-		                           xthreads, ythreads, 1, /* threads */
+		                           xblocks, 1, 1, /* blocks */
+		                           threads_per_block, 1, 1, /* threads */
 		                           0, 0, args, 0));
 
-		device->cuda_pop_context();
-
 		return !device->have_error();
 	}
 };
@@ -1923,12 +1887,12 @@ CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device)
 
 uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
 {
+	CUDAContextScope scope(device);
+
 	device_vector<uint64_t> size_buffer;
 	size_buffer.resize(1);
 	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
 
-	device->cuda_push_context();
-
 	uint threads = num_threads;
 	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
 
@@ -1950,8 +1914,6 @@ uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory
 	                           1, 1, 1,
 	                           0, 0, (void**)&args, 0));
 
-	device->cuda_pop_context();
-
 	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
 	device->mem_free(size_buffer);
 
@@ -1969,7 +1931,7 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim
                                     device_memory& use_queues_flag,
                                     device_memory& work_pool_wgs)
 {
-	device->cuda_push_context();
+	CUDAContextScope scope(device);
 
 	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
 	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
@@ -2033,26 +1995,21 @@ bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim
 
 	CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
 
-	device->cuda_pop_context();
-
 	return !device->have_error();
 }
 
 SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
                                                                 const DeviceRequestedFeatures&)
 {
+	CUDAContextScope scope(device);
 	CUfunction func;
 
-	device->cuda_push_context();
-
 	cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
 	if(device->have_error()) {
 		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
 		return NULL;
 	}
 
-	device->cuda_pop_context();
-
 	return new CUDASplitKernelFunction(device, func);
 }
 
@@ -2063,12 +2020,11 @@ int2 CUDASplitKernel::split_kernel_local_size()
 
 int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
 {
+	CUDAContextScope scope(device);
 	size_t free;
 	size_t total;
 
-	device->cuda_push_context();
 	cuda_assert(cuMemGetInfo(&free, &total));
-	device->cuda_pop_context();
 
 	VLOG(1) << "Maximum device allocation size: "
 	        << string_human_readable_number(free) << " bytes. ("
@@ -2127,18 +2083,34 @@ Device *device_cuda_create(DeviceInfo& info, Stats &stats, bool background)
 	return new CUDADevice(info, stats, background);
 }
 
-void device_cuda_info(vector<DeviceInfo>& devices)
+static CUresult device_cuda_safe_init()
 {
-	CUresult result;
-	int count = 0;
+#ifdef _WIN32
+	__try {
+		return cuInit(0);
+	}
+	__except(EXCEPTION_EXECUTE_HANDLER) {
+		/* Ignore crashes inside the CUDA driver and hope we can
+		 * survive even with corrupted CUDA installs. */
+		fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n");
+	}
 
-	result = cuInit(0);
+	return CUDA_ERROR_NO_DEVICE; /* Callers skip their own error message for this code. */
+#else
+	return cuInit(0);
+#endif
+}
+
+void device_cuda_info(vector<DeviceInfo>& devices)
+{
+	CUresult result = device_cuda_safe_init();
 	if(result != CUDA_SUCCESS) {
 		if(result != CUDA_ERROR_NO_DEVICE)
 			fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
 		return;
 	}
 
+	int count = 0;
 	result = cuDeviceGetCount(&count);
 	if(result != CUDA_SUCCESS) {
 		fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
@@ -2168,7 +2140,6 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 		info.advanced_shading = (major >= 2);
 		info.has_bindless_textures = (major >= 3);
-		info.pack_images = false;
 
 		int pci_location[3] = {0, 0, 0};
 		cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
@@ -2196,7 +2167,7 @@ void device_cuda_info(vector<DeviceInfo>& devices)
 
 string device_cuda_capabilities(void)
 {
-	CUresult result = cuInit(0);
+	CUresult result = device_cuda_safe_init();
 	if(result != CUDA_SUCCESS) {
 		if(result != CUDA_ERROR_NO_DEVICE) {
 			return string("Error initializing CUDA: ") + cuewErrorString(result);
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 66758954f44..571ba9465ca 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -344,7 +344,6 @@ void device_network_info(vector<DeviceInfo>& devices)
 	info.id = "NETWORK";
 	info.num = 0;
 	info.advanced_shading = true; /* todo: get this info from device */
-	info.pack_images = false;
 
 	devices.push_back(info);
 }
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 681b8214b03..9d89decaaaf 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -73,8 +73,34 @@ bool device_opencl_init(void)
 	return result;
 }
 
+/* Query the OpenCL platform count; on Windows a crash inside the driver is caught and reported as zero platforms. */
+static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms)
+{
+#ifdef _WIN32
+	__try {
+		return clGetPlatformIDs(0, NULL, num_platforms);
+	}
+	__except(EXCEPTION_EXECUTE_HANDLER) {
+		/* Ignore crashes inside the OpenCL driver and hope we can
+		 * survive even with corrupted OpenCL installs. */
+		fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n");
+	}
+	/* Only reached after the exception handler above has run. */
+	*num_platforms = 0;
+	return CL_DEVICE_NOT_FOUND;
+#else
+	return clGetPlatformIDs(0, NULL, num_platforms);
+#endif
+}
+
 void device_opencl_info(vector<DeviceInfo>& devices)
 {
+	cl_uint num_platforms = 0;
+	device_opencl_get_num_platforms_safe(&num_platforms);
+	if(num_platforms == 0) {
+		return;
+	}
+
 	vector<OpenCLPlatformDevice> usable_devices;
 	OpenCLInfo::get_usable_devices(&usable_devices);
 	/* Devices are numbered consecutively across platforms. */
@@ -95,7 +121,6 @@ void device_opencl_info(vector<DeviceInfo>& devices)
 		/* We don't know if it's used for display, but assume it is. */
 		info.display_device = true;
 		info.advanced_shading = OpenCLInfo::kernel_use_advanced_shading(platform_name);
-		info.pack_images = true;
 		info.use_split_kernel = OpenCLInfo::kernel_use_split(platform_name,
 		                                                     device_type);
 		info.id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
@@ -114,7 +139,7 @@ string device_opencl_capabilities(void)
 	                         * it could also be nicely reported to the console.
 	                         */
 	cl_uint num_platforms = 0;
-	opencl_assert(clGetPlatformIDs(0, NULL, &num_platforms));
+	opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
 	if(num_platforms == 0) {
 		return "No OpenCL platforms found\n";
 	}
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
new file mode 100644
index 00000000000..b67dfef88aa
--- /dev/null
+++ b/intern/cycles/device/opencl/memory_manager.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPENCL
+
+#include "util/util_foreach.h"
+
+#include "device/opencl/opencl.h"
+#include "device/opencl/memory_manager.h"
+
+CCL_NAMESPACE_BEGIN
+
+void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation)
+{
+	allocations.push_back(&allocation); /* Only the address is stored; the Allocation is not copied. */
+}
+
+void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDeviceBase *device)
+{
+	bool need_realloc = false;
+	/* Sync device memory with the allocation list. First pass, backwards so
+	 * erasing by index is safe: sum live sizes and drop freed allocations. */
+	size_t total_size = 0;
+
+	for(int i = allocations.size()-1; i >= 0; i--) {
+		Allocation* allocation = allocations[i];
+
+		/* Remove allocations that have been freed. */
+		if(!allocation->mem || allocation->mem->memory_size() == 0) {
+			allocation->device_buffer = NULL;
+			allocation->size = 0;
+
+			allocations.erase(allocations.begin()+i);
+
+			need_realloc = true;
+
+			continue;
+		}
+
+		/* Device-side size is the host size passed through align_up(..., 16). */
+		size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
+
+		if(allocation->size != alloc_size) {
+			/* Allocation is either new or resized. */
+			allocation->size = alloc_size;
+			allocation->needs_copy_to_device = true;
+
+			need_realloc = true;
+		}
+
+		total_size += alloc_size;
+	}
+
+	if(need_realloc) {
+		cl_ulong max_buffer_size;
+		clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
+
+		if(total_size > max_buffer_size) {
+			device->set_error("Scene too complex to fit in available memory.");
+			return;
+		}
+		/* Layout changed: build a new buffer and move every allocation into it. */
+		device_memory *new_buffer = new device_memory;
+
+		new_buffer->resize(total_size);
+		device->mem_alloc(string_printf("buffer_%p", this).data(), *new_buffer, MEM_READ_ONLY);
+
+		size_t offset = 0;
+
+		foreach(Allocation* allocation, allocations) {
+			if(allocation->needs_copy_to_device) {
+				/* Copy from host to device. */
+				opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(new_buffer->device_pointer),
+					CL_FALSE, /* Non-blocking; the clFinish() at the end waits for it. */
+					offset,
+					allocation->mem->memory_size(),
+					(void*)allocation->mem->data_pointer,
+					0, NULL, NULL
+				));
+
+				allocation->needs_copy_to_device = false;
+			}
+			else {
+				/* Fast copy from memory already on device. */
+				opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(buffer->device_pointer),
+					CL_MEM_PTR(new_buffer->device_pointer),
+					allocation->desc.offset,
+					offset,
+					allocation->mem->memory_size(),
+					0, NULL, NULL
+				));
+			}
+
+			allocation->desc.offset = offset;
+			offset += allocation->size;
+		}
+
+		device->mem_free(*buffer);
+		delete buffer;
+
+		buffer = new_buffer;
+	}
+	else {
+		assert(total_size == buffer->data_size);
+		/* Layout is unchanged, so only allocations flagged for copy are uploaded. */
+		size_t offset = 0;
+
+		foreach(Allocation* allocation, allocations) {
+			if(allocation->needs_copy_to_device) {
+				/* Copy from host to device. */
+				opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
+					CL_MEM_PTR(buffer->device_pointer),
+					CL_FALSE,
+					offset,
+					allocation->mem->memory_size(),
+					(void*)allocation->mem->data_pointer,
+					0, NULL, NULL
+				));
+
+				allocation->needs_copy_to_device = false;
+			}
+
+			offset += allocation->size;
+		}
+	}
+
+	/* Not really necessary, but seems to improve responsiveness for some reason. */
+	clFinish(device->cqCommandQueue);
+}
+
+void MemoryManager::DeviceBuffer::free(OpenCLDeviceBase *device)
+{
+	device->mem_free(*buffer); /* The device_memory object itself is deleted by ~DeviceBuffer(). */
+}
+
+MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer()
+{
+	/* Pick the buffer holding the fewest bytes; the earliest one wins ties. */
+	DeviceBuffer *best = &device_buffers[0];
+	for(int i = 1; i < NUM_DEVICE_BUFFERS; i++) {
+		if(device_buffers[i].size < best->size) {
+			best = &device_buffers[i];
+		}
+	}
+
+	return best;
+}
+
+MemoryManager::MemoryManager(OpenCLDeviceBase *device) : device(device), need_update(false)
+{ /* No device work here; device memory is only touched from update_device_memory(). */
+}
+
+void MemoryManager::free()
+{
+	for(int i = 0; i < NUM_DEVICE_BUFFERS; i++) {
+		device_buffers[i].free(device); /* Release the device memory of each buffer in turn. */
+	}
+}
+
+void MemoryManager::alloc(const char *name, device_memory& mem)
+{
+	Allocation& allocation = allocations[name];
+	/* Register (or re-register) mem under name; nothing is uploaded here. */
+	allocation.mem = &mem;
+	allocation.needs_copy_to_device = true;
+	/* First use of this name: place it in the device buffer holding the fewest bytes. */
+	if(!allocation.device_buffer) {
+		DeviceBuffer* device_buffer = smallest_device_buffer();
+		allocation.device_buffer = device_buffer;
+		/* The descriptor refers to the buffer by its index in device_buffers. */
+		allocation.desc.device_buffer = device_buffer - device_buffers;
+
+		device_buffer->add_allocation(allocation);
+
+		device_buffer->size += mem.memory_size();
+	}
+	/* The actual device update is deferred to update_device_memory(). */
+	need_update = true;
+}
+
+bool MemoryManager::free(device_memory& mem)
+{
+	/* Mark the allocation backed by mem as freed; returns false if mem is not managed here. */
+	foreach(AllocationsMap::value_type& value, allocations) {
+		Allocation& allocation = value.second;
+		if(allocation.mem == &mem) {
+			/* Empty allocations get detached by update_device_memory() while mem stays set. */
+			if(allocation.device_buffer) {
+				allocation.device_buffer->size -= mem.memory_size();
+			}
+			allocation.mem = NULL;
+			allocation.needs_copy_to_device = false;
+			need_update = true;
+			return true;
+		}
+	}
+	return false;
+}
+
+MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
+{
+	update_device_memory(); /* Offsets are assigned there, so apply pending changes first. */
+	const BufferDescriptor empty = {0, 0}; /* Returned for names that were never allocated. */
+	AllocationsMap::iterator it = allocations.find(name); /* find() does not insert, unlike operator[]. */
+	return (it != allocations.end()) ? it->second.desc : empty;
+}
+
+void MemoryManager::update_device_memory()
+{
+	/* Bring every device buffer up to date, but only when something changed
+	 * since the last call. The flag is cleared before the buffers are visited. */
+	if(need_update) {
+		need_update = false;
+
+		for(int i = 0; i < NUM_DEVICE_BUFFERS; i++) {
+			device_buffers[i].update_device_memory(device);
+		}
+	}
+}
+
+void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
+{
+	update_device_memory();
+	/* One kernel argument per device buffer; buffers without device memory get null_mem so *narg always advances. */
+	foreach(DeviceBuffer& device_buffer, device_buffers) {
+		if(device_buffer.buffer->device_pointer) {
+			device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
+		}
+		else {
+			device->kernel_set_args(kernel, (*narg)++, device->null_mem);
+		}
+	}
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* WITH_OPENCL */
+
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
new file mode 100644
index 00000000000..3714405d026
--- /dev/null
+++ b/intern/cycles/device/opencl/memory_manager.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device.h"
+
+#include "util/util_map.h"
+#include "util/util_vector.h"
+#include "util/util_string.h"
+
+#include "clew.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OpenCLDeviceBase;
+/* Packs named allocations into a fixed number of shared device buffers. */
+class MemoryManager {
+public:
+	static const int NUM_DEVICE_BUFFERS = 8;
+
+	struct BufferDescriptor {
+		uint device_buffer; /* Index into device_buffers. */
+		cl_ulong offset; /* Offset of the allocation inside that buffer. */
+	};
+
+private:
+	struct DeviceBuffer;
+
+	struct Allocation {
+		device_memory *mem; /* Memory being managed; NULL once freed. */
+
+		DeviceBuffer *device_buffer;
+		size_t size; /* Size of actual allocation, may be larger than requested. */
+
+		BufferDescriptor desc;
+
+		bool needs_copy_to_device;
+
+		/* desc() zero-initializes the descriptor, so an entry that was never
+		 * placed in a buffer does not expose indeterminate values. */
+		Allocation() : mem(NULL), device_buffer(NULL), size(0), desc(), needs_copy_to_device(false) {}
+	};
+
+	struct DeviceBuffer {
+		device_memory *buffer; /* Owned; deleted in the destructor. */
+		vector<Allocation*> allocations;
+		size_t size; /* Size of all allocations. */
+
+		DeviceBuffer() : buffer(new device_memory), size(0)
+		{
+		}
+
+		~DeviceBuffer() {
+			delete buffer;
+			buffer = NULL;
+		}
+
+		void add_allocation(Allocation& allocation);
+
+		void update_device_memory(OpenCLDeviceBase *device);
+
+		void free(OpenCLDeviceBase *device);
+	};
+
+	OpenCLDeviceBase *device;
+
+	DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
+
+	typedef unordered_map<string, Allocation> AllocationsMap;
+	AllocationsMap allocations;
+
+	bool need_update; /* Set by alloc()/free(); cleared by update_device_memory(). */
+
+	DeviceBuffer* smallest_device_buffer();
+
+public:
+	MemoryManager(OpenCLDeviceBase *device);
+
+	void free(); /* Free all memory. */
+
+	void alloc(const char *name, device_memory& mem);
+	bool free(device_memory& mem); /* Returns false if mem is not managed here. */
+
+	BufferDescriptor get_descriptor(string name);
+
+	void update_device_memory();
+	void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
+};
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 78ca377d933..26bf4a9af5b 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -25,6 +25,8 @@
 
 #include "clew.h"
 
+#include "device/opencl/memory_manager.h"
+
 CCL_NAMESPACE_BEGIN
 
 /* Disable workarounds, seems to be working fine on latest drivers. */
@@ -224,6 +226,18 @@ public:
 	static string get_kernel_md5();
 };
 
+#define opencl_device_assert(device, stmt) \
+	{ \
+		cl_int err = stmt; \
+		/* Only the first error is stored on the device; every failure is printed. */ \
+		if(err != CL_SUCCESS) { \
+			string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
+			if((device)->error_message() == "") \
+				(device)->set_error(message); \
+			fprintf(stderr, "%s\n", message.c_str()); \
+		} \
+	} (void)0
+
 #define opencl_assert(stmt) \
 	{ \
 		cl_int err = stmt; \
@@ -344,6 +358,7 @@ public:
 	size_t global_size_round_up(int group_size, int global_size);
 	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
 	void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
+	void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
 
 	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
 	void shader(DeviceTask& task);
@@ -525,6 +540,42 @@ protected:
 
 	virtual string build_options_for_base_program(
 	        const DeviceRequestedFeatures& /*requested_features*/);
+
+private:
+	MemoryManager memory_manager;
+	friend class MemoryManager;
+
+	struct tex_info_t {
+		uint buffer, padding;
+		cl_ulong offset;
+		uint width, height, depth, options;
+	};
+	static_assert_align(tex_info_t, 16);
+
+	vector<tex_info_t> texture_descriptors;
+	device_memory texture_descriptors_buffer;
+
+	struct Texture {
+		Texture() {} /* Needed by TexturesMap::operator[]; leaves the members uninitialized. */
+		Texture(device_memory* mem,
+		         InterpolationType interpolation,
+		         ExtensionType extension)
+		    : mem(mem),
+			  interpolation(interpolation),
+			  extension(extension) {
+		}
+		device_memory* mem;
+		InterpolationType interpolation;
+		ExtensionType extension;
+	};
+
+	typedef map<string, Texture> TexturesMap;
+	TexturesMap textures;
+
+	bool textures_need_update;
+
+protected:
+	void flush_texture_buffers();
 };
 
 Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background);
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index 509da7a0a84..7bdf81462b8 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -29,6 +29,15 @@
 
 CCL_NAMESPACE_BEGIN
 
+/* Pairs a texture name with an integer slot. */
+struct texture_slot_t {
+	string name;
+	int slot;
+	texture_slot_t(const string& name, int slot)
+	: name(name), slot(slot)
+	{}
+};
+
 bool OpenCLDeviceBase::opencl_error(cl_int err)
 {
 	if(err != CL_SUCCESS) {
@@ -63,7 +72,7 @@ void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
 }
 
 OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
-: Device(info, stats, background_)
+: Device(info, stats, background_), memory_manager(this)
 {
 	cpPlatform = NULL;
 	cdDevice = NULL;
@@ -71,6 +80,7 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	cqCommandQueue = NULL;
 	null_mem = 0;
 	device_initialized = false;
+	textures_need_update = true;
 
 	vector<OpenCLPlatformDevice> usable_devices;
 	OpenCLInfo::get_usable_devices(&usable_devices);
@@ -126,6 +136,12 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 		return;
 	}
 
+	/* Allocate this right away so that texture_descriptors_buffer is placed at offset 0 in the device memory buffers */
+	texture_descriptors.resize(1);
+	texture_descriptors_buffer.resize(1);
+	texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0];
+	memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
+
 	fprintf(stderr, "Device init success\n");
 	device_initialized = true;
 }
@@ -134,6 +150,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase()
 {
 	task_pool.stop();
 
+	memory_manager.free();
+
 	if(null_mem)
 		clReleaseMemObject(CL_MEM_PTR(null_mem));
 
@@ -493,29 +511,35 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
 
 void OpenCLDeviceBase::tex_alloc(const char *name,
                device_memory& mem,
-               InterpolationType /*interpolation*/,
-               ExtensionType /*extension*/)
+               InterpolationType interpolation,
+               ExtensionType extension)
 {
 	VLOG(1) << "Texture allocate: " << name << ", "
 	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 	        << string_human_readable_size(mem.memory_size()) << ")";
-	mem_alloc(NULL, mem, MEM_READ_ONLY);
-	mem_copy_to(mem);
-	assert(mem_map.find(name) == mem_map.end());
-	mem_map.insert(MemMap::value_type(name, mem.device_pointer));
+	/* Allocation goes through the memory manager, keyed by the texture name. */
+	memory_manager.alloc(name, mem);
+	/* Set the pointer to non-null to keep code that inspects its value from thinking it's unallocated. */
+	mem.device_pointer = 1;
+	textures[name] = Texture(&mem, interpolation, extension);
+	textures_need_update = true;  /* Descriptors are rebuilt lazily by flush_texture_buffers(). */
 }
 
 void OpenCLDeviceBase::tex_free(device_memory& mem)
 {
 	if(mem.device_pointer) {
-		foreach(const MemMap::value_type& value, mem_map) {
-			if(value.second == mem.device_pointer) {
-				mem_map.erase(value.first);
+		mem.device_pointer = 0;  /* Clears the placeholder value written by tex_alloc(). */
+		/* Descriptors are only marked stale when memory_manager.free() returns true (presumably: it owned this allocation - confirm). */
+		if(memory_manager.free(mem)) {
+			textures_need_update = true;
+		}
+		/* Drop the map entry that refers to this memory; the break right after erase() keeps the loop off the modified map. */
+		foreach(TexturesMap::value_type& value, textures) {
+			if(value.second.mem == &mem) {
+				textures.erase(value.first);
 				break;
 			}
 		}
-
-		mem_free(mem);
 	}
 }
 
@@ -581,6 +605,98 @@ void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const
 	opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
 }
 
+void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
+{
+	flush_texture_buffers();  /* Returns immediately unless textures_need_update is set. */
+	/* Must come after the flush; the memory manager sets the buffer arguments on `kernel` and receives narg unchanged. */
+	memory_manager.set_kernel_arg_buffers(kernel, narg);
+}
+
+/* Rebuild the texture descriptor table after textures were allocated or freed.
+ * Every texture declared in kernel_textures.h reserves one slot, allocated or
+ * not; "__tex_image*" textures follow, offset by the id in their name. */
+void OpenCLDeviceBase::flush_texture_buffers()
+{
+	if(!textures_need_update) {
+		return;
+	}
+	textures_need_update = false;
+
+	/* Setup slots for textures. */
+	int num_slots = 0;
+	vector<texture_slot_t> texture_slots;
+
+#define KERNEL_TEX(type, ttype, name) \
+	if(textures.find(#name) != textures.end()) { \
+		texture_slots.push_back(texture_slot_t(#name, num_slots)); \
+	} \
+	num_slots++;
+#include "kernel/kernel_textures.h"
+#undef KERNEL_TEX
+	int num_data_slots = num_slots;
+
+	/* Image textures are placed after the data slots, at their image id. */
+	foreach(TexturesMap::value_type& tex, textures) {
+		const string& name = tex.first;
+		if(string_startswith(name, "__tex_image")) {
+			int pos = name.rfind("_");
+			int id = atoi(name.data() + pos + 1);
+			texture_slots.push_back(texture_slot_t(name,
+				                                   num_data_slots + id));
+			num_slots = max(num_slots, num_data_slots + id + 1);
+		}
+	}
+
+	/* Realloc texture descriptors buffer. */
+	memory_manager.free(texture_descriptors_buffer);
+
+	texture_descriptors.resize(num_slots);
+	texture_descriptors_buffer.resize(num_slots * sizeof(tex_info_t));
+	texture_descriptors_buffer.data_pointer = (device_ptr)&texture_descriptors[0];
+
+	memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
+
+	/* Fill in descriptors. */
+	foreach(texture_slot_t& slot, texture_slots) {
+		Texture& tex = textures[slot.name];
+		tex_info_t& info = texture_descriptors[slot.slot];
+		MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
+
+		info.offset = desc.offset;
+		info.buffer = desc.device_buffer;
+
+		if(string_startswith(slot.name, "__tex_image")) {
+			info.width = tex.mem->data_width;
+			info.height = tex.mem->data_height;
+			info.depth = tex.mem->data_depth;
+
+			/* Option bits: 0 = closest interpolation, 1..3 = extension mode. */
+			info.options = 0;
+			if(tex.interpolation == INTERPOLATION_CLOSEST) {
+				info.options |= (1 << 0);
+			}
+
+			switch(tex.extension) {
+				case EXTENSION_REPEAT:
+					info.options |= (1 << 1);
+					break;
+				case EXTENSION_EXTEND:
+					info.options |= (1 << 2);
+					break;
+				case EXTENSION_CLIP:
+					info.options |= (1 << 3);
+					break;
+				default:
+					break;
+			}
+		}
+	}
+
+	/* Force write of descriptors. */
+	memory_manager.free(texture_descriptors_buffer);
+	memory_manager.alloc("texture_descriptors", texture_descriptors_buffer);
+}
+
 void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
 {
 	/* cast arguments to cl types */
@@ -605,10 +721,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_
 		                d_rgba,
 		                d_buffer);
 
-#define KERNEL_TEX(type, ttype, name) \
-set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+	set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
 
 	start_arg_index += kernel_set_args(ckFilmConvertKernel,
 	                                   start_arg_index,
@@ -1030,10 +1143,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
 		                                   d_output_luma);
 	}
 
-#define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(kernel, &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+	set_kernel_arg_buffers(kernel, &start_arg_index);
 
 	start_arg_index += kernel_set_args(kernel,
 	                                   start_arg_index,
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index 06c15bcf401..ec47fdafa3d 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -82,10 +82,7 @@ public:
 			                d_buffer,
 			                d_rng_state);
 
-#define KERNEL_TEX(type, ttype, name) \
-		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+		set_kernel_arg_buffers(ckPathTraceKernel, &start_arg_index);
 
 		start_arg_index += kernel_set_args(ckPathTraceKernel,
 		                                   start_arg_index,
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 76d9983e9a2..16a96213100 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -99,6 +99,8 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
+		flush_texture_buffers();
+
 		if(task->type == DeviceTask::FILM_CONVERT) {
 			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
 		}
@@ -113,10 +115,19 @@ public:
 			 */
 			typedef struct KernelGlobals {
 				ccl_constant KernelData *data;
+				ccl_global char *buffers[8];
+				/* Texture descriptor; NOTE(review): field layout presumably has to match the device-side tex_info_t - confirm. */
+				typedef struct _tex_info_t {
+					uint buffer, padding;
+					uint64_t offset;
+					uint width, height, depth, options;
+				} _tex_info_t;
+				/* One descriptor member per texture declared in kernel_textures.h. */
 #define KERNEL_TEX(type, ttype, name) \
-				ccl_global type *name;
+				_tex_info_t name;
 #include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
+
 				SplitData split_data;
 				SplitParams split_param_data;
 			} KernelGlobals;
@@ -217,11 +228,7 @@ public:
 					            *cached_memory.ray_state,
 					            *cached_memory.rng_state);
 
-/* TODO(sergey): Avoid map lookup here. */
-#define KERNEL_TEX(type, ttype, name) \
-				device->set_kernel_arg_mem(program(), &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+				device->set_kernel_arg_buffers(program(), &start_arg_index);
 
 			start_arg_index +=
 				device->kernel_set_args(program(),
@@ -352,11 +359,7 @@ public:
 			                ray_state,
 			                rtile.rng_state);
 
-/* TODO(sergey): Avoid map lookup here. */
-#define KERNEL_TEX(type, ttype, name) \
-	device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name);
-#include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
+			device->set_kernel_arg_buffers(device->program_data_init(), &start_arg_index);
 
 		start_arg_index +=
 			device->kernel_set_args(device->program_data_init(),
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 0d34af3e040..7d5173a5f1d 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -635,7 +635,7 @@ bool OpenCLInfo::device_supported(const string& platform_name,
 			"Tahiti", "Pitcairn", "Capeverde", "Oland",
 			NULL
 		};
-		for (int i = 0; blacklist[i] != NULL; i++) {
+		for(int i = 0; blacklist[i] != NULL; i++) {
 			if(device_name == blacklist[i]) {
 				VLOG(1) << "AMD device " << device_name << " not supported";
 				return false;
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 23e9bd311c4..b4ca16bdb48 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -79,7 +79,6 @@ set(SRC_HEADERS
 	kernel_compat_cpu.h
 	kernel_compat_cuda.h
 	kernel_compat_opencl.h
-	kernel_debug.h
 	kernel_differential.h
 	kernel_emission.h
 	kernel_film.h
@@ -202,6 +201,7 @@ set(SRC_GEOM_HEADERS
 	geom/geom.h
 	geom/geom_attribute.h
 	geom/geom_curve.h
+	geom/geom_curve_intersect.h
 	geom/geom_motion_curve.h
 	geom/geom_motion_triangle.h
 	geom/geom_motion_triangle_intersect.h
@@ -233,6 +233,7 @@ set(SRC_FILTER_HEADERS
 set(SRC_UTIL_HEADERS
 	../util/util_atomic.h
 	../util/util_color.h
+	../util/util_defines.h
 	../util/util_half.h
 	../util/util_hash.h
 	../util/util_math.h
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 85741016b25..cf0c8542d69 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -233,7 +233,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
                                                      const Ray *ray,
                                                      Intersection *isect,
-                                                     int skip_object,
+                                                     uint visibility,
                                                      uint max_hits,
                                                      uint *num_hits)
 {
@@ -244,7 +244,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 			return bvh_intersect_shadow_all_hair_motion(kg,
 			                                            ray,
 			                                            isect,
-			                                            skip_object,
+			                                            visibility,
 			                                            max_hits,
 			                                            num_hits);
 		}
@@ -253,7 +253,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 		return bvh_intersect_shadow_all_motion(kg,
 		                                       ray,
 		                                       isect,
-		                                       skip_object,
+		                                       visibility,
 		                                       max_hits,
 		                                       num_hits);
 	}
@@ -264,7 +264,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 		return bvh_intersect_shadow_all_hair(kg,
 		                                     ray,
 		                                     isect,
-		                                     skip_object,
+		                                     visibility,
 		                                     max_hits,
 		                                     num_hits);
 	}
@@ -275,7 +275,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 		return bvh_intersect_shadow_all_instancing(kg,
 		                                           ray,
 		                                           isect,
-		                                           skip_object,
+		                                           visibility,
 		                                           max_hits,
 		                                           num_hits);
 	}
@@ -284,7 +284,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
 	return bvh_intersect_shadow_all(kg,
 	                                ray,
 	                                isect,
-	                                skip_object,
+	                                visibility,
 	                                max_hits,
 	                                num_hits);
 }
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 74a9ebf14e4..6c33dad5426 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -52,8 +52,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 	float c0hiy = (node1.z - P.y) * idir.y;
 	float c0loz = (node2.x - P.z) * idir.z;
 	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+	float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+	float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
 
 	float c1lox = (node0.y - P.x) * idir.x;
 	float c1hix = (node0.w - P.x) * idir.x;
@@ -61,8 +61,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 	float c1hiy = (node1.w - P.y) * idir.y;
 	float c1loz = (node2.y - P.z) * idir.z;
 	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+	float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+	float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
 
 	dist[0] = c0min;
 	dist[1] = c1min;
@@ -101,8 +101,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
 	float c0hiy = (node1.z - P.y) * idir.y;
 	float c0loz = (node2.x - P.z) * idir.z;
 	float c0hiz = (node2.z - P.z) * idir.z;
-	float c0min = max4(min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz), 0.0f);
-	float c0max = min4(max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz), t);
+	float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+	float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
 
 	float c1lox = (node0.y - P.x) * idir.x;
 	float c1hix = (node0.w - P.x) * idir.x;
@@ -110,8 +110,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
 	float c1hiy = (node1.w - P.y) * idir.y;
 	float c1loz = (node2.y - P.z) * idir.z;
 	float c1hiz = (node2.w - P.z) * idir.z;
-	float c1min = max4(min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz), 0.0f);
-	float c1max = min4(max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz), t);
+	float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+	float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
 
 	if(difl != 0.0f) {
 		float hdiff = 1.0f + difl;
@@ -483,8 +483,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 	ssef tfar_y = max(lower_y, upper_y);
 	ssef tfar_z = max(lower_z, upper_z);
 
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	sseb vmask = tnear <= tfar;
 	dist[0] = tnear.f[0];
 	dist[1] = tnear.f[1];
@@ -545,8 +545,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 	ssef tfar_y = max(lower_y, upper_y);
 	ssef tfar_z = max(lower_z, upper_z);
 
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	sseb vmask;
 	if(difl != 0.0f) {
 		const float round_down = 1.0f - difl;
@@ -615,7 +615,7 @@ ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
                                                      const float3& P,
                                                      const float3& dir,
                                                      const ssef& isect_near,
-                                                      const ssef& isect_far,
+                                                     const ssef& isect_far,
                                                      const ssef& tsplat,
                                                      const ssef Psplat[3],
                                                      const ssef idirsplat[3],
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 267e098f912..a6a4353562c 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -45,7 +45,7 @@ ccl_device_inline
 bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                  const Ray *ray,
                                  Intersection *isect_array,
-                                 const int skip_object,
+                                 const uint visibility,
                                  const uint max_hits,
                                  uint *num_hits)
 {
@@ -119,7 +119,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               idir,
 				                               isect_t,
 				                               node_addr,
-				                               PATH_RAY_SHADOW,
+				                               visibility,
 				                               dist);
 #else // __KERNEL_SSE2__
 				traverse_mask = NODE_INTERSECT(kg,
@@ -134,7 +134,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				                               idirsplat,
 				                               shufflexyz,
 				                               node_addr,
-				                               PATH_RAY_SHADOW,
+				                               visibility,
 				                               dist);
 #endif // __KERNEL_SSE2__
 
@@ -186,17 +186,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					/* primitive intersection */
 					while(prim_addr < prim_addr2) {
 						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
-
-#ifdef __SHADOW_TRICKS__
-						uint tri_object = (object == OBJECT_NONE)
-						        ? kernel_tex_fetch(__prim_object, prim_addr)
-						        : object;
-						if(tri_object == skip_object) {
-							++prim_addr;
-							continue;
-						}
-#endif
-
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -209,7 +198,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								                         isect_array,
 								                         P,
 								                         dir,
-								                         PATH_RAY_SHADOW,
+								                         visibility,
 								                         object,
 								                         prim_addr);
 								break;
@@ -221,7 +210,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								                                P,
 								                                dir,
 								                                ray->time,
-								                                PATH_RAY_SHADOW,
+								                                visibility,
 								                                object,
 								                                prim_addr);
 								break;
@@ -232,30 +221,30 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 							case PRIMITIVE_MOTION_CURVE: {
 								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect_array,
-									                                   P,
-									                                   dir,
-									                                   PATH_RAY_SHADOW,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   NULL,
-									                                   0, 0);
+									hit = cardinal_curve_intersect(kg,
+									                               isect_array,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               NULL,
+									                               0, 0);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect_array,
-									                          P,
-									                          dir,
-									                          PATH_RAY_SHADOW,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          NULL,
-									                          0, 0);
+									hit = curve_intersect(kg,
+									                      isect_array,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      NULL,
+									                      0, 0);
 								}
 								break;
 							}
@@ -402,7 +391,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const Ray *ray,
                                          Intersection *isect_array,
-                                         const int skip_object,
+                                         const uint visibility,
                                          const uint max_hits,
                                          uint *num_hits)
 {
@@ -411,7 +400,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
 		                                    ray,
 		                                    isect_array,
-		                                    skip_object,
+		                                    visibility,
 		                                    max_hits,
 		                                    num_hits);
 	}
@@ -422,7 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
 		                                   ray,
 		                                   isect_array,
-		                                   skip_object,
+		                                   visibility,
 		                                   max_hits,
 		                                   num_hits);
 	}
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index c58d3b0316c..ae8f54821f2 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -244,14 +244,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								{
 									/* shadow ray early termination */
 #if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #  if BVH_FEATURE(BVH_HAIR)
 									tfar = ssef(isect->t);
 #  endif
 #else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 #endif
 								}
@@ -274,14 +274,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								{
 									/* shadow ray early termination */
 #  if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #    if BVH_FEATURE(BVH_HAIR)
 									tfar = ssef(isect->t);
 #    endif
 #  else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 #  endif
 								}
@@ -298,44 +298,44 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   lcg_state,
-									                                   difl,
-									                                   extmax);
+									hit = cardinal_curve_intersect(kg,
+									                               isect,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               lcg_state,
+									                               difl,
+									                               extmax);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          lcg_state,
-									                          difl,
-									                          extmax);
+									hit = curve_intersect(kg,
+									                      isect,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      lcg_state,
+									                      difl,
+									                      extmax);
 								}
 								if(hit) {
 									/* shadow ray early termination */
 #  if defined(__KERNEL_SSE2__)
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 									tsplat = ssef(0.0f, 0.0f, -isect->t, -isect->t);
 #    if BVH_FEATURE(BVH_HAIR)
 									tfar = ssef(isect->t);
 #    endif
 #  else
-									if(visibility == PATH_RAY_SHADOW_OPAQUE)
+									if(visibility & PATH_RAY_SHADOW_OPAQUE)
 										return true;
 #  endif
 								}
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 6d22f0b0d6a..3036efd4198 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -126,8 +126,8 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg
 	const sseb vmask = cast(tnear) > cast(tfar);
 	int mask = (int)movemask(vmask)^0xf;
 #else
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	const sseb vmask = tnear <= tfar;
 	int mask = (int)movemask(vmask);
 #endif
@@ -174,8 +174,8 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust(
 
 	const float round_down = 1.0f - difl;
 	const float round_up = 1.0f + difl;
-	const ssef tnear = max4(tnear_x, tnear_y, tnear_z, isect_near);
-	const ssef tfar = min4(tfar_x, tfar_y, tfar_z, isect_far);
+	const ssef tnear = max4(isect_near, tnear_x, tnear_y, tnear_z);
+	const ssef tfar = min4(isect_far, tfar_x, tfar_y, tfar_z);
 	const sseb vmask = round_down*tnear <= round_up*tfar;
 	*dist = tnear;
 	return (int)movemask(vmask);
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index ce474438f2c..522213f30ca 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -33,7 +33,7 @@
 ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                              const Ray *ray,
                                              Intersection *isect_array,
-                                             const int skip_object,
+                                             const uint visibility,
                                              const uint max_hits,
                                              uint *num_hits)
 {
@@ -107,7 +107,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 				if(false
 #ifdef __VISIBILITY_FLAG__
-				   || ((__float_as_uint(inodes.x) & PATH_RAY_SHADOW) == 0)
+				   || ((__float_as_uint(inodes.x) & visibility) == 0)
 #endif
 #if BVH_FEATURE(BVH_MOTION)
 				   || UNLIKELY(ray->time < inodes.y)
@@ -244,7 +244,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			if(node_addr < 0) {
 				float4 leaf = kernel_tex_fetch(__bvh_leaf_nodes, (-node_addr-1));
 #ifdef __VISIBILITY_FLAG__
-				if((__float_as_uint(leaf.z) & PATH_RAY_SHADOW) == 0) {
+				if((__float_as_uint(leaf.z) & visibility) == 0) {
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
 					--stack_ptr;
@@ -268,17 +268,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					/* Primitive intersection. */
 					while(prim_addr < prim_addr2) {
 						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
-
-#ifdef __SHADOW_TRICKS__
-						uint tri_object = (object == OBJECT_NONE)
-						        ? kernel_tex_fetch(__prim_object, prim_addr)
-						        : object;
-						if(tri_object == skip_object) {
-							++prim_addr;
-							continue;
-						}
-#endif
-
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -291,7 +280,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								                         isect_array,
 								                         P,
 								                         dir,
-								                         PATH_RAY_SHADOW,
+								                         visibility,
 								                         object,
 								                         prim_addr);
 								break;
@@ -303,7 +292,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								                                P,
 								                                dir,
 								                                ray->time,
-								                                PATH_RAY_SHADOW,
+								                                visibility,
 								                                object,
 								                                prim_addr);
 								break;
@@ -314,30 +303,30 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 							case PRIMITIVE_MOTION_CURVE: {
 								const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect_array,
-									                                   P,
-									                                   dir,
-									                                   PATH_RAY_SHADOW,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   NULL,
-									                                   0, 0);
+									hit = cardinal_curve_intersect(kg,
+									                               isect_array,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               NULL,
+									                               0, 0);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect_array,
-									                          P,
-									                          dir,
-									                          PATH_RAY_SHADOW,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          NULL,
-									                          0, 0);
+									hit = curve_intersect(kg,
+									                      isect_array,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      NULL,
+									                      0, 0);
 								}
 								break;
 							}
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index fca75a1d416..335a4afd47a 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -340,7 +340,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								                      prim_addr)) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+									if(visibility & PATH_RAY_SHADOW_OPAQUE) {
 										return true;
 									}
 								}
@@ -362,7 +362,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								                             prim_addr)) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+									if(visibility & PATH_RAY_SHADOW_OPAQUE) {
 										return true;
 									}
 								}
@@ -379,37 +379,37 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
 								bool hit;
 								if(kernel_data.curve.curveflags & CURVE_KN_INTERPOLATE) {
-									hit = bvh_cardinal_curve_intersect(kg,
-									                                   isect,
-									                                   P,
-									                                   dir,
-									                                   visibility,
-									                                   object,
-									                                   prim_addr,
-									                                   ray->time,
-									                                   curve_type,
-									                                   lcg_state,
-									                                   difl,
-									                                   extmax);
+									hit = cardinal_curve_intersect(kg,
+									                               isect,
+									                               P,
+									                               dir,
+									                               visibility,
+									                               object,
+									                               prim_addr,
+									                               ray->time,
+									                               curve_type,
+									                               lcg_state,
+									                               difl,
+									                               extmax);
 								}
 								else {
-									hit = bvh_curve_intersect(kg,
-									                          isect,
-									                          P,
-									                          dir,
-									                          visibility,
-									                          object,
-									                          prim_addr,
-									                          ray->time,
-									                          curve_type,
-									                          lcg_state,
-									                          difl,
-									                          extmax);
+									hit = curve_intersect(kg,
+									                      isect,
+									                      P,
+									                      dir,
+									                      visibility,
+									                      object,
+									                      prim_addr,
+									                      ray->time,
+									                      curve_type,
+									                      lcg_state,
+									                      difl,
+									                      extmax);
 								}
 								if(hit) {
 									tfar = ssef(isect->t);
 									/* Shadow ray early termination. */
-									if(visibility == PATH_RAY_SHADOW_OPAQUE) {
+									if(visibility & PATH_RAY_SHADOW_OPAQUE) {
 										return true;
 									}
 								}
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 22d0092093a..2f2c35d5d1f 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -40,20 +40,20 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 }
 
 /* Sample slope distribution (based on page 14 of the supplemental implementation). */
-ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
+ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float randx, const float randy)
 {
 	if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
-		const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
-		const float phi = M_2PI_F * randU.y;
+		const float r = sqrtf(randx / max(1.0f - randx, 1e-7f));
+		const float phi = M_2PI_F * randy;
 		return make_float2(r*cosf(phi), r*sinf(phi));
 	}
 
-	const float sinI = sqrtf(1.0f - cosI*cosI);
+	const float sinI = safe_sqrtf(1.0f - cosI*cosI);
 	const float tanI = sinI/cosI;
 	const float projA = 0.5f * (cosI + 1.0f);
 	if(projA < 0.0001f)
 		return make_float2(0.0f, 0.0f);
-	const float A = 2.0f*randU.x*projA / cosI - 1.0f;
+	const float A = 2.0f*randx*projA / cosI - 1.0f;
 	float tmp = A*A-1.0f;
 	if(fabsf(tmp) < 1e-7f)
 		return make_float2(0.0f, 0.0f);
@@ -64,24 +64,24 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 ran
 	const float slopeX = (A < 0.0f || slopeX2 > 1.0f/tanI)? (tanI*tmp - D) : slopeX2;
 
 	float U2;
-	if(randU.y >= 0.5f)
-		U2 = 2.0f*(randU.y - 0.5f);
+	if(randy >= 0.5f)
+		U2 = 2.0f*(randy - 0.5f);
 	else
-		U2 = 2.0f*(0.5f - randU.y);
+		U2 = 2.0f*(0.5f - randy);
 	const float z = (U2*(U2*(U2*0.27385f-0.73369f)+0.46341f)) / (U2*(U2*(U2*0.093073f+0.309420f)-1.0f)+0.597999f);
 	const float slopeY = z * sqrtf(1.0f + slopeX*slopeX);
 
-	if(randU.y >= 0.5f)
+	if(randy >= 0.5f)
 		return make_float2(slopeX, slopeY);
 	else
 		return make_float2(slopeX, -slopeY);
 }
 
 /* Visible normal sampling for the GGX distribution (based on page 7 of the supplemental implementation). */
-ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float2 randU)
+ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha, const float randx, const float randy)
 {
 	const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
-	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
+	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randx, randy);
 
 	const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f));
 	const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y);
@@ -474,6 +474,7 @@ ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg, const ShaderC
 	*eval *= *pdf;
 
 	*omega_in = X*localO.x + Y*localO.y + Z*localO.z;
+
 #ifdef __RAY_DIFFERENTIALS__
 	*domega_in_dx = (2 * dot(Z, dIdx)) * Z - dIdx;
 	*domega_in_dy = (2 * dot(Z, dIdy)) * Z - dIdy;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index 2eb2457c9e5..e73915dbda7 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -100,11 +100,14 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 	bool outside = true;
 
 	for(int order = 0; order < 10; order++) {
-		/* Sample microfacet height and normal */
-		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state)))
+		/* Sample microfacet height. */
+		float height_rand = lcg_step_float_addrspace(lcg_state);
+		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand))
 			break;
-		float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
-		                                                   lcg_step_float_addrspace(lcg_state)));
+		/* Sample microfacet normal. */
+		float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+		float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+		float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
 
 #ifdef MF_MULTI_GLASS
 		if(order == 0 && use_fresnel) {
@@ -136,7 +139,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
 #ifdef MF_MULTI_GLASS
 			bool next_outside;
 			float3 wi_prev = -wr;
-			wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+			float phase_rand = lcg_step_float_addrspace(lcg_state);
+			wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
 			if(!next_outside) {
 				outside = !outside;
 				wr = -wr;
@@ -204,14 +208,16 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
 	int order;
 	for(order = 0; order < 10; order++) {
 		/* Sample microfacet height. */
-		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, lcg_step_float_addrspace(lcg_state))) {
+		float height_rand = lcg_step_float_addrspace(lcg_state);
+		if(!mf_sample_height(wr, &hr, &C1_r, &G1_r, &lambda_r, height_rand)) {
 			/* The random walk has left the surface. */
 			*wo = outside? wr: -wr;
 			return throughput;
 		}
 		/* Sample microfacet normal. */
-		float3 wm = mf_sample_vndf(-wr, alpha, make_float2(lcg_step_float_addrspace(lcg_state),
-		                                                   lcg_step_float_addrspace(lcg_state)));
+		float vndf_rand_y = lcg_step_float_addrspace(lcg_state);
+		float vndf_rand_x = lcg_step_float_addrspace(lcg_state);
+		float3 wm = mf_sample_vndf(-wr, alpha, vndf_rand_x, vndf_rand_y);
 
 		/* First-bounce color is already accounted for in mix weight. */
 		if(!use_fresnel && order > 0)
@@ -221,7 +227,8 @@ ccl_device_forceinline float3 MF_FUNCTION_FULL_NAME(mf_sample)(
 #ifdef MF_MULTI_GLASS
 		bool next_outside;
 		float3 wi_prev = -wr;
-		wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, lcg_step_float_addrspace(lcg_state), &next_outside);
+		float phase_rand = lcg_step_float_addrspace(lcg_state);
+		wr = mf_sample_phase_glass(-wr, outside? eta: 1.0f/eta, wm, phase_rand, &next_outside);
 		if(!next_outside) {
 			hr = -hr;
 			wr = -wr;
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index f733ea4c517..267aeea6e86 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -348,8 +348,9 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
 {
 	Bssrdf *bssrdf = (Bssrdf*)closure_alloc(sd, sizeof(Bssrdf), CLOSURE_NONE_ID, weight);
 
-	if(!bssrdf)
+	if(bssrdf == NULL) {
 		return NULL;
+	}
 
 	float sample_weight = fabsf(average(weight));
 	bssrdf->sample_weight = sample_weight;
@@ -399,7 +400,7 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
 			bssrdf_burley_setup(bssrdf);
 		}
 
-		return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
+		return SD_BSSRDF;
 	}
 }
 
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
index 3185330994c..3ddd8712266 100644
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ b/intern/cycles/kernel/filter/filter_features_sse.h
@@ -16,7 +16,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
+#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
 
 /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
  * pixel_buffer always points to the first of the 4 current pixel in the first pass.
@@ -24,25 +24,25 @@ CCL_NAMESPACE_BEGIN
 
 #define FOR_PIXEL_WINDOW_SSE     pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
                                  for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
-                                     __m128 y4 = _mm_set1_ps(pixel.y); \
+                                     float4 y4 = make_float4(pixel.y); \
                                      for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
-                                         __m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
-                                         __m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
+                                         float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
+                                         int4 active_pixels = x4 < make_float4(high.x);
 
 #define END_FOR_PIXEL_WINDOW_SSE     } \
                                      pixel_buffer += buffer_w - (pixel.x - low.x); \
                                  }
 
-ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
-                                               __m128 active_pixels,
+ccl_device_inline void filter_get_features_sse(float4 x, float4 y,
+                                               int4 active_pixels,
                                                const float *ccl_restrict buffer,
-                                               __m128 *features,
-                                               const __m128 *ccl_restrict mean,
+                                               float4 *features,
+                                               const float4 *ccl_restrict mean,
                                                int pass_stride)
 {
 	features[0] = x;
 	features[1] = y;
-	features[2] = _mm_fabs_ps(ccl_get_feature_sse(0));
+	features[2] = fabs(ccl_get_feature_sse(0));
 	features[3] = ccl_get_feature_sse(1);
 	features[4] = ccl_get_feature_sse(2);
 	features[5] = ccl_get_feature_sse(3);
@@ -52,53 +52,41 @@ ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y,
 	features[9] = ccl_get_feature_sse(7);
 	if(mean) {
 		for(int i = 0; i < DENOISE_FEATURES; i++)
-			features[i] = _mm_sub_ps(features[i], mean[i]);
+			features[i] = features[i] - mean[i];
 	}
 	for(int i = 0; i < DENOISE_FEATURES; i++)
-		features[i] = _mm_mask_ps(features[i], active_pixels);
+		features[i] = mask(active_pixels, features[i]);
 }
 
-ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y,
-                                                     __m128 active_pixels,
+ccl_device_inline void filter_get_feature_scales_sse(float4 x, float4 y,
+                                                     int4 active_pixels,
                                                      const float *ccl_restrict buffer,
-                                                     __m128 *scales,
-                                                     const __m128 *ccl_restrict mean,
+                                                     float4 *scales,
+                                                     const float4 *ccl_restrict mean,
                                                      int pass_stride)
 {
-	scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
-	scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
-
-	scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(_mm_fabs_ps(ccl_get_feature_sse(0)), mean[2])), active_pixels);
-
-	__m128 diff, scale;
-	diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
-	scale = _mm_mul_ps(diff, diff);
-	diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
-	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-	diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
-	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-	scales[3] = _mm_mask_ps(scale, active_pixels);
-
-	scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
-
-	diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
-	scale = _mm_mul_ps(diff, diff);
-	diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
-	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-	diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
-	scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
-	scales[5] = _mm_mask_ps(scale, active_pixels);
+	scales[0] = fabs(x - mean[0]);
+	scales[1] = fabs(y - mean[1]);
+	scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
+	scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) +
+	            sqr(ccl_get_feature_sse(2) - mean[4]) +
+	            sqr(ccl_get_feature_sse(3) - mean[5]);
+	scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
+	scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) +
+	            sqr(ccl_get_feature_sse(6) - mean[8]) +
+	            sqr(ccl_get_feature_sse(7) - mean[9]);
+	for(int i = 0; i < 6; i++)
+		scales[i] = mask(active_pixels, scales[i]);
 }
 
-ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
+ccl_device_inline void filter_calculate_scale_sse(float4 *scale) /* Rewrites scale[0..9] in place from the values in scale[0..5]. */
 {
-	scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
-	scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
-	scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
-	scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
-
-	scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
-	scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
+	scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f))); /* The 0.01 floor is applied before taking the reciprocal. */
+	scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
+	scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
+	scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f))); /* Output slot 6 is computed from input slot 4. */
+	scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f))); /* Slots 7-9 share one value computed from input slot 5. */
+	scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f))); /* Must stay last: slots 4 and 5 are read as inputs by the two lines above. */
 }
 
 
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
index 3e752bce68f..5e989331bc2 100644
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h
@@ -50,10 +50,8 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                                               int w,
                                               int f)
 {
-#ifdef __KERNEL_SSE3__
-	int aligned_lowx = (rect.x & ~(3));
-	int aligned_highx = ((rect.z + 3) & ~(3));
-#endif
+	int aligned_lowx = rect.x / 4;
+	int aligned_highx = (rect.z + 3) / 4;
 	for(int y = rect.y; y < rect.w; y++) {
 		const int low = max(rect.y, y-f);
 		const int high = min(rect.w, y+f+1);
@@ -61,15 +59,11 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
 			out_image[y*w+x] = 0.0f;
 		}
 		for(int y1 = low; y1 < high; y1++) {
-#ifdef __KERNEL_SSE3__
-			for(int x = aligned_lowx; x < aligned_highx; x+=4) {
-				_mm_store_ps(out_image + y*w+x, _mm_add_ps(_mm_load_ps(out_image + y*w+x), _mm_load_ps(difference_image + y1*w+x)));
+			float4* out_image4 = (float4*)(out_image + y*w);
+			float4* difference_image4 = (float4*)(difference_image + y1*w);
+			for(int x = aligned_lowx; x < aligned_highx; x++) {
+				out_image4[x] += difference_image4[x];
 			}
-#else
-			for(int x = rect.x; x < rect.z; x++) {
-				out_image[y*w+x] += difference_image[y1*w+x];
-			}
-#endif
 		}
 		for(int x = rect.x; x < rect.z; x++) {
 			out_image[y*w+x] *= 1.0f/(high - low);
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
index d5ae1b73927..2aeb54a62be 100644
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ b/intern/cycles/kernel/filter/filter_prefilter.h
@@ -61,8 +61,8 @@ ccl_device void kernel_filter_divide_shadow(int sample,
 		varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
 		varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
 	}
-	varA /= (odd_sample - 1);
-	varB /= (even_sample - 1);
+	varA /= max(odd_sample - 1, 1);
+	varB /= max(even_sample - 1, 1);
 
 	sampleVariance[idx]  = 0.5f*(varA + varB) / sample;
 	sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
@@ -96,11 +96,17 @@ ccl_device void kernel_filter_get_feature(int sample,
 	int idx = (y-rect.y)*buffer_w + (x - rect.x);
 
 	mean[idx] = center_buffer[m_offset] / sample;
-	if(use_split_variance) {
-		variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+	if(sample > 1) {
+		if(use_split_variance) {
+			variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
+		}
+		else {
+			variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+		}
 	}
 	else {
-		variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
+		/* Can't compute variance with a single sample, just set it very high. */
+		variance[idx] = 1e10f;
 	}
 }
 
@@ -114,49 +120,56 @@ ccl_device void kernel_filter_detect_outliers(int x, int y,
 {
 	int buffer_w = align_up(rect.z - rect.x, 4);
 
-	int n = 0;
-	float values[25];
-	for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
-		for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
-			int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
-			float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
-
-			/* Find the position of L. */
-			int i;
-			for(i = 0; i < n; i++) {
-				if(values[i] > L) break;
-			}
-			/* Make space for L by shifting all following values to the right. */
-			for(int j = n; j > i; j--) {
-				values[j] = values[j-1];
-			}
-			/* Insert L. */
-			values[i] = L;
-			n++;
-		}
-	}
-
 	int idx = (y-rect.y)*buffer_w + (x-rect.x);
-	float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+	float3 color = make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]);
 
-	float ref = 2.0f*values[(int)(n*0.75f)];
 	float fac = 1.0f;
-	if(L > ref) {
-		/* The pixel appears to be an outlier.
-		 * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
-		 * should actually be at the reference value:
-		 * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
-		 * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
-		 */
-		float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
-		if(L - 3*stddev < ref) {
-			/* The pixel is an outlier, so negate the depth value to mark it as one.
-			 * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
-			depth[idx] = -depth[idx];
-			fac = ref/L;
-			variance[idx              ] *= fac*fac;
-			variance[idx + pass_stride] *= fac*fac;
-			variance[idx+2*pass_stride] *= fac*fac;
+	if(color.x < 0.0f || color.y < 0.0f || color.z < 0.0f) {
+		depth[idx] = -depth[idx];
+		fac = 0.0f;
+	}
+	else {
+		float L = average(color);
+		int n = 0;
+		float values[25];
+		for(int y1 = max(y-2, rect.y); y1 < min(y+3, rect.w); y1++) {
+			for(int x1 = max(x-2, rect.x); x1 < min(x+3, rect.z); x1++) {
+				int idx = (y1-rect.y)*buffer_w + (x1-rect.x);
+				float L = average(make_float3(image[idx], image[idx+pass_stride], image[idx+2*pass_stride]));
+
+				/* Find the position of L. */
+				int i;
+				for(i = 0; i < n; i++) {
+					if(values[i] > L) break;
+				}
+				/* Make space for L by shifting all following values to the right. */
+				for(int j = n; j > i; j--) {
+					values[j] = values[j-1];
+				}
+				/* Insert L. */
+				values[i] = L;
+				n++;
+			}
+		}
+
+		float ref = 2.0f*values[(int)(n*0.75f)];
+		if(L > ref) {
+			/* The pixel appears to be an outlier.
+			 * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is that the pixel
+			 * should actually be at the reference value:
+			 * If the reference is within the 3-sigma interval, the pixel is assumed to be a statistical outlier.
+			 * Otherwise, it is very unlikely that the pixel should be darker, which indicates a legitimate highlight.
+			 */
+			float stddev = sqrtf(average(make_float3(variance[idx], variance[idx+pass_stride], variance[idx+2*pass_stride])));
+			if(L - 3*stddev < ref) {
+				/* The pixel is an outlier, so negate the depth value to mark it as one.
+				 * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM weights. */
+				depth[idx] = -depth[idx];
+				fac = ref/L;
+				variance[idx              ] *= fac*fac;
+				variance[idx + pass_stride] *= fac*fac;
+				variance[idx+2*pass_stride] *= fac*fac;
+			}
 		}
 	}
 	out[idx              ] = fac*image[idx];
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 30dc2969b11..9e65f61664b 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -24,7 +24,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 {
 	int buffer_w = align_up(rect.z - rect.x, 4);
 
-	__m128 features[DENOISE_FEATURES];
+	float4 features[DENOISE_FEATURES];
 	const float *ccl_restrict pixel_buffer;
 	int2 pixel;
 
@@ -34,19 +34,19 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 	                      min(rect.w, y + radius + 1));
 	int num_pixels = (high.y - low.y) * (high.x - low.x);
 
-	__m128 feature_means[DENOISE_FEATURES];
+	float4 feature_means[DENOISE_FEATURES];
 	math_vector_zero_sse(feature_means, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW_SSE {
 		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
 		math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
 	} END_FOR_PIXEL_WINDOW_SSE
 
-	__m128 pixel_scale = _mm_set1_ps(1.0f / num_pixels);
+	float4 pixel_scale = make_float4(1.0f / num_pixels);
 	for(int i = 0; i < DENOISE_FEATURES; i++) {
-		feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
+		feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
 	}
 
-	__m128 feature_scale[DENOISE_FEATURES];
+	float4 feature_scale[DENOISE_FEATURES];
 	math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW_SSE {
 		filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
@@ -55,12 +55,12 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
 	filter_calculate_scale_sse(feature_scale);
 
-	__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
+	float4 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
 	math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
 	FOR_PIXEL_WINDOW_SSE {
 		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
 		math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
-		math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
+		math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, make_float4(1.0f));
 	} END_FOR_PIXEL_WINDOW_SSE
 
 	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
@@ -98,7 +98,7 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
 
 	/* Bake the feature scaling into the transformation matrix. */
 	for(int i = 0; i < DENOISE_FEATURES; i++) {
-		math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
+		math_vector_scale(transform + i*DENOISE_FEATURES, feature_scale[i][0], *rank);
 	}
 }
 
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index c623e3490fd..f34b77ebc07 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -27,6 +27,7 @@
 #include "kernel/geom/geom_motion_triangle_shader.h"
 #include "kernel/geom/geom_motion_curve.h"
 #include "kernel/geom/geom_curve.h"
+#include "kernel/geom/geom_curve_intersect.h"
 #include "kernel/geom/geom_volume.h"
 #include "kernel/geom/geom_primitive.h"
 
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 5c3b0ee3c15..e35267f02bf 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -16,18 +16,13 @@ CCL_NAMESPACE_BEGIN
 
 /* Curve Primitive
  *
- * Curve primitive for rendering hair and fur. These can be render as flat ribbons
- * or curves with actual thickness. The curve can also be rendered as line segments
- * rather than curves for better performance */
+ * Curve primitive for rendering hair and fur. These can be rendered as flat
+ * ribbons or curves with actual thickness. The curve can also be rendered as
+ * line segments rather than curves for better performance.
+ */
 
 #ifdef __HAIR__
 
-#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
-#  define ccl_device_curveintersect ccl_device
-#else
-#  define ccl_device_curveintersect ccl_device_forceinline
-#endif
-
 /* Reading attributes on various curve elements */
 
 ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
@@ -151,7 +146,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 /* Curve tangent normal */
 
 ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
-{	
+{
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
 	if(sd->type & PRIMITIVE_ALL_CURVE) {
@@ -219,893 +214,6 @@ ccl_device_inline void curvebounds(float *lower, float *upper, float *extremta,
 	}
 }
 
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
-{
-	return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));
-}
-#endif
-
-#ifdef __KERNEL_SSE2__
-/* Pass P and dir by reference to aligned vector */
-ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
-#else
-ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
-#endif
-{
-	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
-	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
-		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
-		if(time < prim_time.x || time > prim_time.y) {
-			return false;
-		}
-	}
-
-	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-	float epsilon = 0.0f;
-	float r_st, r_en;
-
-	int depth = kernel_data.curve.subdivisions;
-	int flags = kernel_data.curve.curveflags;
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-
-#ifdef __KERNEL_SSE2__
-	ssef vdir = load4f(dir);
-	ssef vcurve_coef[4];
-	const float3 *curve_coef = (float3 *)vcurve_coef;
-	
-	{
-		ssef dtmp = vdir * vdir;
-		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
-		ssef rd_ss = load1f_first(1.0f) / d_ss;
-
-		ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
-		int2 &v00 = (int2 &)v00vec;
-
-		int k0 = v00.x + segment;
-		int k1 = k0 + 1;
-		int ka = max(k0 - 1, v00.x);
-		int kb = min(k1 + 1, v00.x + v00.y - 1);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
-		avxf P_curve_0_1, P_curve_2_3;
-		if(is_curve_primitive) {
-			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
-			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
-			motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
-		}
-#else  /* __KERNEL_AVX2__ */
-		ssef P_curve[4];
-
-		if(is_curve_primitive) {
-			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
-			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
-			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
-			P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
-		}
-#endif  /* __KERNEL_AVX2__ */
-
-		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
-		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
-		ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
-		ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
-		ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
-
-		ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
-		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
-		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
-		const avxf vPP = _mm256_broadcast_ps(&P.m128);
-		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
-		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
-		const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
-
-		const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
-		                      htfm00,
-		                      madd(shuffle<1>(P_curve_0_1 - vPP),
-		                           htfm11,
-		                           shuffle<2>(P_curve_0_1 - vPP) * htfm22));
-		const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
-		                      htfm00,
-		                      madd(shuffle<1>(P_curve_2_3 - vPP),
-		                           htfm11,
-		                           shuffle<2>(P_curve_2_3 - vPP)*htfm22));
-
-		const ssef p0 = _mm256_castps256_ps128(p01);
-		const ssef p1 = _mm256_extractf128_ps(p01, 1);
-		const ssef p2 = _mm256_castps256_ps128(p23);
-		const ssef p3 = _mm256_extractf128_ps(p23, 1);
-
-		const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
-		r_st = ((float4 &)P_curve_1).w;
-		const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
-		r_en = ((float4 &)P_curve_2).w;
-#else  /* __KERNEL_AVX2__ */
-		ssef htfm[] = { htfm0, htfm1, htfm2 };
-		ssef vP = load4f(P);
-		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
-		ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
-		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
-		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
-
-		r_st = ((float4 &)P_curve[1]).w;
-		r_en = ((float4 &)P_curve[2]).w;
-#endif  /* __KERNEL_AVX2__ */
-
-		float fc = 0.71f;
-		ssef vfc = ssef(fc);
-		ssef vfcxp3 = vfc * p3;
-
-		vcurve_coef[0] = p1;
-		vcurve_coef[1] = vfc * (p2 - p0);
-		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
-		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
-
-	}
-#else
-	float3 curve_coef[4];
-
-	/* curve Intersection check */
-	/* obtain curve parameters */
-	{
-		/* ray transform created - this should be created at beginning of intersection loop */
-		Transform htfm;
-		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);
-		htfm = make_transform(
-			dir.z / d, 0, -dir.x /d, 0,
-			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
-			dir.x, dir.y, dir.z, 0,
-			0, 0, 0, 1);
-
-		float4 v00 = kernel_tex_fetch(__curves, prim);
-
-		int k0 = __float_as_int(v00.x) + segment;
-		int k1 = k0 + 1;
-
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P_curve[4];
-
-		if(is_curve_primitive) {
-			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-		}
-		else {
-			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
-		}
-
-		float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
-		float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
-		float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
-		float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
-
-		float fc = 0.71f;
-		curve_coef[0] = p1;
-		curve_coef[1] = -fc*p0 + fc*p2;
-		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
-		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
-		r_st = P_curve[1].w;
-		r_en = P_curve[2].w;
-	}
-#endif
-
-	float r_curr = max(r_st, r_en);
-
-	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
-		epsilon = 2 * r_curr;
-
-	/* find bounds - this is slow for cubic curves */
-	float upper, lower;
-
-	float zextrem[4];
-	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
-	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
-		return false;
-
-	/* minimum width extension */
-	float mw_extension = min(difl * fabsf(upper), extmax);
-	float r_ext = mw_extension + r_curr;
-
-	float xextrem[4];
-	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	float yextrem[4];
-	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
-	if(lower > r_ext || upper < -r_ext)
-		return false;
-
-	/* setup recurrent loop */
-	int level = 1 << depth;
-	int tree = 0;
-	float resol = 1.0f / (float)level;
-	bool hit = false;
-
-	/* begin loop */
-	while(!(tree >> (depth))) {
-		const float i_st = tree * resol;
-		const float i_en = i_st + (level * resol);
-
-#ifdef __KERNEL_SSE2__
-		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
-		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
-		ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
-
-		ssef vbmin = min(vp_st, vp_en);
-		ssef vbmax = max(vp_st, vp_en);
-
-		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
-		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
-		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
-		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
-#else
-		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
-		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
-		
-		float bminx = min(p_st.x, p_en.x);
-		float bmaxx = max(p_st.x, p_en.x);
-		float bminy = min(p_st.y, p_en.y);
-		float bmaxy = max(p_st.y, p_en.y);
-		float bminz = min(p_st.z, p_en.z);
-		float bmaxz = max(p_st.z, p_en.z);
-#endif
-
-		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {
-			bminx = min(bminx,xextrem[1]);
-			bmaxx = max(bmaxx,xextrem[1]);
-		}
-		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
-			bminx = min(bminx,xextrem[3]);
-			bmaxx = max(bmaxx,xextrem[3]);
-		}
-		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
-			bminy = min(bminy,yextrem[1]);
-			bmaxy = max(bmaxy,yextrem[1]);
-		}
-		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
-			bminy = min(bminy,yextrem[3]);
-			bmaxy = max(bmaxy,yextrem[3]);
-		}
-		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
-			bminz = min(bminz,zextrem[1]);
-			bmaxz = max(bmaxz,zextrem[1]);
-		}
-		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
-			bminz = min(bminz,zextrem[3]);
-			bmaxz = max(bmaxz,zextrem[3]);
-		}
-
-		float r1 = r_st + (r_en - r_st) * i_st;
-		float r2 = r_st + (r_en - r_st) * i_en;
-		r_curr = max(r1, r2);
-
-		mw_extension = min(difl * fabsf(bmaxz), extmax);
-		float r_ext = mw_extension + r_curr;
-		float coverage = 1.0f;
-
-		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
-			/* the bounding box does not overlap the square centered at O */
-			tree += level;
-			level = tree & -tree;
-		}
-		else if(level == 1) {
-
-			/* the maximum recursion depth is reached.
-			 * check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
-			 * dP* is reversed if necessary.*/
-			float t = isect->t;
-			float u = 0.0f;
-			float gd = 0.0f;
-
-			if(flags & CURVE_KN_RIBBONS) {
-				float3 tg = (p_en - p_st);
-#ifdef __KERNEL_SSE__
-				const float3 tg_sq = tg * tg;
-				float w = tg_sq.x + tg_sq.y;
-#else
-				float w = tg.x * tg.x + tg.y * tg.y;
-#endif
-				if(w == 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-#ifdef __KERNEL_SSE__
-				const float3 p_sttg = p_st * tg;
-				w = -(p_sttg.x + p_sttg.y) / w;
-#else
-				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
-#endif
-				w = saturate(w);
-
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-				r_curr = r_st + (r_en - r_st) * u;
-				/* compare x-y distances */
-				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if(dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if(dot(tg, dp_en) < 0)
-					dp_en *= -1;
-				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				/* compute coverage */
-				float r_ext = r_curr;
-				coverage = 1.0f;
-				if(difl != 0.0f) {
-					mw_extension = min(difl * fabsf(bmaxz), extmax);
-					r_ext = mw_extension + r_curr;
-#ifdef __KERNEL_SSE__
-					const float3 p_curr_sq = p_curr * p_curr;
-					const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
-					float d = dxxx.x;
-#else
-					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
-#endif
-					float d0 = d - r_curr;
-					float d1 = d + r_curr;
-					float inv_mw_extension = 1.0f/mw_extension;
-					if(d0 >= 0)
-						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
-					else // inside
-						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
-				}
-				
-				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				t = p_curr.z;
-
-				/* stochastic fade from minimum width */
-				if(difl != 0.0f && lcg_state) {
-					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
-						return hit;
-				}
-			}
-			else {
-				float l = len(p_en - p_st);
-				/* minimum width extension */
-				float or1 = r1;
-				float or2 = r2;
-
-				if(difl != 0.0f) {
-					mw_extension = min(len(p_st - P) * difl, extmax);
-					or1 = r1 < mw_extension ? mw_extension : r1;
-					mw_extension = min(len(p_en - P) * difl, extmax);
-					or2 = r2 < mw_extension ? mw_extension : r2;
-				}
-				/* --- */
-				float invl = 1.0f/l;
-				float3 tg = (p_en - p_st) * invl;
-				gd = (or2 - or1) * invl;
-				float difz = -dot(p_st,tg);
-				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
-				float invcyla = 1.0f/cyla;
-				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
-				float tcentre = -halfb*invcyla;
-				float zcentre = difz + (tg.z * tcentre);
-				float3 tdif = - p_st;
-				tdif.z += tcentre;
-				float tdifz = dot(tdif,tg);
-				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
-				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
-				float td = tb*tb - 4*cyla*tc;
-				if(td < 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-				
-				float rootd = sqrtf(td);
-				float correction = (-tb - rootd) * 0.5f * invcyla;
-				t = tcentre + correction;
-
-				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
-				if(dot(tg, dp_st)< 0)
-					dp_st *= -1;
-				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
-				if(dot(tg, dp_en) < 0)
-					dp_en *= -1;
-
-				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
-					correction = (-tb + rootd) * 0.5f * invcyla;
-					t = tcentre + correction;
-				}			
-
-				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
-					tree++;
-					level = tree & -tree;
-					continue;
-				}
-
-				float w = (zcentre + (tg.z * correction)) * invl;
-				w = saturate(w);
-				/* compute u on the curve segment */
-				u = i_st * (1 - w) + i_en * w;
-
-				/* stochastic fade from minimum width */
-				if(difl != 0.0f && lcg_state) {
-					r_curr = r1 + (r2 - r1) * w;
-					r_ext = or1 + (or2 - or1) * w;
-					coverage = r_curr/r_ext;
-
-					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
-						return hit;
-				}
-			}
-			/* we found a new intersection */
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->t = t;
-				isect->u = u;
-				isect->v = gd;
-				isect->prim = curveAddr;
-				isect->object = object;
-				isect->type = type;
-				hit = true;
-			}
-			
-			tree++;
-			level = tree & -tree;
-		}
-		else {
-			/* split the curve into two curves and process */
-			level = level >> 1;
-		}
-	}
-
-	return hit;
-}
-
-ccl_device_curveintersect bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
-	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
-{
-	/* define few macros to minimize code duplication for SSE */
-#ifndef __KERNEL_SSE2__
-#  define len3_squared(x) len_squared(x)
-#  define len3(x) len(x)
-#  define dot3(x, y) dot(x, y)
-#endif
-
-	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
-
-	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
-		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
-		if(time < prim_time.x || time > prim_time.y) {
-			return false;
-		}
-	}
-
-	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
-	/* curve Intersection check */
-	int flags = kernel_data.curve.curveflags;
-
-	int prim = kernel_tex_fetch(__prim_index, curveAddr);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int cnum = __float_as_int(v00.x);
-	int k0 = cnum + segment;
-	int k1 = k0 + 1;
-
-#ifndef __KERNEL_SSE2__
-	float4 P_curve[2];
-
-	if(is_curve_primitive) {
-		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
-		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
-	}
-	else {
-		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-		motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
-	}
-
-	float or1 = P_curve[0].w;
-	float or2 = P_curve[1].w;
-	float3 p1 = float4_to_float3(P_curve[0]);
-	float3 p2 = float4_to_float3(P_curve[1]);
-
-	/* minimum width extension */
-	float r1 = or1;
-	float r2 = or2;
-	float3 dif = P - p1;
-	float3 dif_second = P - p2;
-	if(difl != 0.0f) {
-		float pixelsize = min(len3(dif) * difl, extmax);
-		r1 = or1 < pixelsize ? pixelsize : or1;
-		pixelsize = min(len3(dif_second) * difl, extmax);
-		r2 = or2 < pixelsize ? pixelsize : or2;
-	}
-	/* --- */
-
-	float3 p21_diff = p2 - p1;
-	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
-	float3 dir = direction;
-	float sphere_b_tmp = dot3(dir, sphere_dif1);
-	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
-#else
-	ssef P_curve[2];
-	
-	if(is_curve_primitive) {
-		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
-		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
-	}
-	else {
-		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
-		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
-	}
-
-	const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
-
-	ssef r12 = or12;
-	const ssef vP = load4f(P);
-	const ssef dif = vP - P_curve[0];
-	const ssef dif_second = vP - P_curve[1];
-	if(difl != 0.0f) {
-		const ssef len1_sq = len3_squared_splat(dif);
-		const ssef len2_sq = len3_squared_splat(dif_second);
-		const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
-		const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
-		r12 = max(or12, pixelsize12);
-	}
-	float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
-	float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
-
-	const ssef p21_diff = P_curve[1] - P_curve[0];
-	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
-	const ssef dir = load4f(direction);
-	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
-	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
-#endif
-
-	float mr = max(r1, r2);
-	float l = len3(p21_diff);
-	float invl = 1.0f / l;
-	float sp_r = mr + 0.5f * l;
-
-	float sphere_b = dot3(dir, sphere_dif2);
-	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
-
-	if(sdisc < 0.0f)
-		return false;
-
-	/* obtain parameters and test midpoint distance for suitable modes */
-#ifndef __KERNEL_SSE2__
-	float3 tg = p21_diff * invl;
-#else
-	const ssef tg = p21_diff * invl;
-#endif
-	float gd = (r2 - r1) * invl;
-
-	float dirz = dot3(dir, tg);
-	float difz = dot3(dif, tg);
-
-	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
-
-	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
-
-	float tcentre = -halfb/a;
-	float zcentre = difz + (dirz * tcentre);
-
-	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
-		return false;
-	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
-		return false;
-
-	/* test minimum separation */
-#ifndef __KERNEL_SSE2__
-	float3 cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross(tg, dif));
-#else
-	const ssef cprod = cross(tg, dir);
-	float cprod2sq = len3_squared(cross_zxy(tg, dif));
-#endif
-	float cprodsq = len3_squared(cprod);
-	float distscaled = dot3(cprod, dif);
-
-	if(cprodsq == 0)
-		distscaled = cprod2sq;
-	else
-		distscaled = (distscaled*distscaled)/cprodsq;
-
-	if(distscaled > mr*mr)
-		return false;
-
-	/* calculate true intersection */
-#ifndef __KERNEL_SSE2__
-	float3 tdif = dif + tcentre * dir;
-#else
-	const ssef tdif = madd(ssef(tcentre), dir, dif);
-#endif
-	float tdifz = dot3(tdif, tg);
-	float tdifma = tdifz*gd + r1;
-	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
-	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
-	float td = tb*tb - 4*a*tc;
-
-	if(td < 0.0f)
-		return false;
-
-	float rootd = 0.0f;
-	float correction = 0.0f;
-	if(flags & CURVE_KN_ACCURATE) {
-		rootd = sqrtf(td);
-		correction = ((-tb - rootd)/(2*a));
-	}
-
-	float t = tcentre + correction;
-
-	if(t < isect->t) {
-
-		if(flags & CURVE_KN_INTERSECTCORRECTION) {
-			rootd = sqrtf(td);
-			correction = ((-tb - rootd)/(2*a));
-			t = tcentre + correction;
-		}
-
-		float z = zcentre + (dirz * correction);
-		// bool backface = false;
-
-		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
-			// backface = true;
-			correction = ((-tb + rootd)/(2*a));
-			t = tcentre + correction;
-			z = zcentre + (dirz * correction);
-		}
-
-		/* stochastic fade from minimum width */
-		float adjradius = or1 + z * (or2 - or1) * invl;
-		adjradius = adjradius / (r1 + z * gd);
-		if(lcg_state && adjradius != 1.0f) {
-			if(lcg_step_float(lcg_state) > adjradius)
-				return false;
-		}
-		/* --- */
-
-		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
-
-			if(flags & CURVE_KN_ENCLOSEFILTER) {
-				float enc_ratio = 1.01f;
-				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
-					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
-					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
-					if(a2*c2 < 0.0f)
-						return false;
-				}
-			}
-
-#ifdef __VISIBILITY_FLAG__
-			/* visibility flag test. we do it here under the assumption
-			 * that most triangles are culled by node flags */
-			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
-#endif
-			{
-				/* record intersection */
-				isect->t = t;
-				isect->u = z*invl;
-				isect->v = gd;
-				isect->prim = curveAddr;
-				isect->object = object;
-				isect->type = type;
-
-				return true;
-			}
-		}
-	}
-
-	return false;
-
-#ifndef __KERNEL_SSE2__
-#  undef len3_squared
-#  undef len3
-#  undef dot3
-#endif
-}
-
-ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float fc = 0.71f;
-	float data[4];
-	float t2 = t * t;
-	data[0] = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
-	data[1] =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
-	data[2] =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
-	data[3] =  3.0f * fc          * t2  - 2.0f * fc * t;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)
-{
-	float data[4];
-	float fc = 0.71f;
-	float t2 = t * t;
-	float t3 = t2 * t;
-	data[0] = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
-	data[1] =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
-	data[2] =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
-	data[3] =  fc          * t3  - fc * t2;
-	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
-}
-
-ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray)
-{
-	int flag = kernel_data.curve.curveflags;
-	float t = isect->t;
-	float3 P = ray->P;
-	float3 D = ray->D;
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_itfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-		D = transform_direction(&tfm, D*t);
-		D = normalize_len(D, &t);
-	}
-
-	int prim = kernel_tex_fetch(__prim_index, isect->prim);
-	float4 v00 = kernel_tex_fetch(__curves, prim);
-
-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
-	int k1 = k0 + 1;
-
-	float3 tg;
-
-	if(flag & CURVE_KN_INTERPOLATE) {
-		int ka = max(k0 - 1,__float_as_int(v00.x));
-		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
-
-		float4 P_curve[4];
-
-		if(sd->type & PRIMITIVE_CURVE) {
-			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
-			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
-			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
-			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
-		}
-		else {
-			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
-		}
-
-		float3 p[4];
-		p[0] = float4_to_float3(P_curve[0]);
-		p[1] = float4_to_float3(P_curve[1]);
-		p[2] = float4_to_float3(P_curve[2]);
-		p[3] = float4_to_float3(P_curve[3]);
-
-		P = P + D*t;
-
-#ifdef __UV__
-		sd->u = isect->u;
-		sd->v = 0.0f;
-#endif
-
-		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
-
-		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
-		}
-		else {
-			/* direction from inside to surface of curve */
-			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			sd->Ng = normalize(P - p_curr);
-
-			/* adjustment for changing radius */
-			float gd = isect->v;
-
-			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
-			}
-		}
-
-		/* todo: sometimes the normal is still so that this is detected as
-		 * backfacing even if cull backfaces is enabled */
-
-		sd->N = sd->Ng;
-	}
-	else {
-		float4 P_curve[2];
-
-		if(sd->type & PRIMITIVE_CURVE) {
-			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
-			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
-		}
-		else {
-			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
-		}
-
-		float l = 1.0f;
-		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
-		
-		P = P + D*t;
-
-		float3 dif = P - float4_to_float3(P_curve[0]);
-
-#ifdef __UV__
-		sd->u = dot(dif,tg)/l;
-		sd->v = 0.0f;
-#endif
-
-		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			sd->Ng = -(D - tg * dot(tg, D));
-			sd->Ng = normalize(sd->Ng);
-		}
-		else {
-			float gd = isect->v;
-
-			/* direction from inside to surface of curve */
-			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
-
-			/* adjustment for changing radius */
-			if(gd != 0.0f) {
-				sd->Ng = sd->Ng - gd * tg;
-				sd->Ng = normalize(sd->Ng);
-			}
-		}
-
-		sd->N = sd->Ng;
-	}
-
-#ifdef __DPDU__
-	/* dPdu/dPdv */
-	sd->dPdu = tg;
-	sd->dPdv = cross(tg, sd->Ng);
-#endif
-
-	if(isect->object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
-		Transform tfm = sd->ob_tfm;
-#else
-		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-#endif
-
-		P = transform_point(&tfm, P);
-	}
-
-	return P;
-}
-
-#endif
+#endif  /* __HAIR__ */
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
new file mode 100644
index 00000000000..e9a149ea1ab
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -0,0 +1,934 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Curve primitive intersection functions. */
+
+#ifdef __HAIR__
+
+#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)  /* Qualifier used by the curve intersection functions below. */
+#  define ccl_device_curveintersect ccl_device
+#else  /* Not CUDA, or CUDA with __CUDA_ARCH__ >= 300. */
+#  define ccl_device_curveintersect ccl_device_forceinline
+#endif  /* __KERNEL_CUDA__ && (__CUDA_ARCH__ < 300) */
+
+#ifdef __KERNEL_SSE2__
+ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)  /* Called by cardinal_curve_intersect() with t = htfm and a = (curve key - ray origin). */
+{
+	return madd(shuffle<0>(a), t[0], madd(shuffle<1>(a), t[1], shuffle<2>(a) * t[2]));  /* NOTE(review): presumably shuffle<i>(a) splats lane i and madd(x, y, z) is x*y + z, giving a.x*t[0] + a.y*t[1] + a.z*t[2] - confirm against the SIMD helpers. */
+}
+#endif  /* __KERNEL_SSE2__ */
+
+/* Intersect the ray (P, dir) with one segment of a cardinal curve; a recorded hit updates isect and makes the function return true. On CPU pass P and dir by reference to aligned vector. */
+ccl_device_curveintersect bool cardinal_curve_intersect(
+        KernelGlobals *kg,
+        Intersection *isect,
+        const float3 ccl_ref P,
+        const float3 ccl_ref dir,
+        uint visibility,
+        int object,
+        int curveAddr,
+        float time,
+        int type,
+        uint *lcg_state,
+        float difl,
+        float extmax)
+{
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {  /* Ray time lies outside [prim_time.x, prim_time.y]. */
+			return false;
+		}
+	}
+
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	float epsilon = 0.0f;
+	float r_st, r_en;  /* Radii (w components) of keys k0 and k1. */
+
+	int depth = kernel_data.curve.subdivisions;
+	int flags = kernel_data.curve.curveflags;
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+
+#ifdef __KERNEL_SSE2__
+	ssef vdir = load4f(dir);
+	ssef vcurve_coef[4];
+	const float3 *curve_coef = (float3 *)vcurve_coef;  /* Scalar view of the same coefficients, used by the shared code below. */
+
+	{
+		ssef dtmp = vdir * vdir;
+		ssef d_ss = mm_sqrt(dtmp + shuffle<2>(dtmp));
+		ssef rd_ss = load1f_first(1.0f) / d_ss;
+
+		ssei v00vec = load4i((ssei *)&kg->__curves.data[prim]);
+		int2 &v00 = (int2 &)v00vec;
+
+		int k0 = v00.x + segment;
+		int k1 = k0 + 1;
+		int ka = max(k0 - 1, v00.x);  /* Clamp the neighbouring keys to [v00.x, v00.x + v00.y - 1]. */
+		int kb = min(k1 + 1, v00.x + v00.y - 1);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+		avxf P_curve_0_1, P_curve_2_3;
+		if(is_curve_primitive) {
+			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
+			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, curveAddr) : object;
+			motion_cardinal_curve_keys_avx(kg, fobject, prim, time, ka, k0, k1, kb, &P_curve_0_1,&P_curve_2_3);
+		}
+#else  /* __KERNEL_AVX2__ */
+		ssef P_curve[4];
+
+		if(is_curve_primitive) {
+			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
+			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
+			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
+			P_curve[3] = load4f(&kg->__curve_keys.data[kb].x);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, (float4*)&P_curve);
+		}
+#endif  /* __KERNEL_AVX2__ */
+
+		ssef rd_sgn = set_sign_bit<0, 1, 1, 1>(shuffle<0>(rd_ss));
+		ssef mul_zxxy = shuffle<2, 0, 0, 1>(vdir) * rd_sgn;
+		ssef mul_yz = shuffle<1, 2, 1, 2>(vdir) * mul_zxxy;
+		ssef mul_shuf = shuffle<0, 1, 2, 3>(mul_zxxy, mul_yz);
+		ssef vdir0 = vdir & cast(ssei(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
+
+		ssef htfm0 = shuffle<0, 2, 0, 3>(mul_shuf, vdir0);
+		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
+		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
+
+#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
+		const avxf vPP = _mm256_broadcast_ps(&P.m128);
+		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
+		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
+		const avxf htfm22 = avxf(htfm2.m128, htfm2.m128);
+
+		const avxf p01 = madd(shuffle<0>(P_curve_0_1 - vPP),
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_0_1 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_0_1 - vPP) * htfm22));
+		const avxf p23 = madd(shuffle<0>(P_curve_2_3 - vPP),
+		                      htfm00,
+		                      madd(shuffle<1>(P_curve_2_3 - vPP),
+		                           htfm11,
+		                           shuffle<2>(P_curve_2_3 - vPP)*htfm22));
+
+		const ssef p0 = _mm256_castps256_ps128(p01);
+		const ssef p1 = _mm256_extractf128_ps(p01, 1);
+		const ssef p2 = _mm256_castps256_ps128(p23);
+		const ssef p3 = _mm256_extractf128_ps(p23, 1);
+
+		const ssef P_curve_1 = _mm256_extractf128_ps(P_curve_0_1, 1);
+		r_st = ((float4 &)P_curve_1).w;
+		const ssef P_curve_2 = _mm256_castps256_ps128(P_curve_2_3);
+		r_en = ((float4 &)P_curve_2).w;
+#else  /* __KERNEL_AVX2__ */
+		ssef htfm[] = { htfm0, htfm1, htfm2 };
+		ssef vP = load4f(P);
+		ssef p0 = transform_point_T3(htfm, P_curve[0] - vP);
+		ssef p1 = transform_point_T3(htfm, P_curve[1] - vP);
+		ssef p2 = transform_point_T3(htfm, P_curve[2] - vP);
+		ssef p3 = transform_point_T3(htfm, P_curve[3] - vP);
+
+		r_st = ((float4 &)P_curve[1]).w;
+		r_en = ((float4 &)P_curve[2]).w;
+#endif  /* __KERNEL_AVX2__ */
+
+		float fc = 0.71f;  /* Same constant as in curvetangent() and curvepoint(). */
+		ssef vfc = ssef(fc);
+		ssef vfcxp3 = vfc * p3;
+
+		vcurve_coef[0] = p1;  /* Cubic coefficients; the loop below evaluates ((c3*u + c2)*u + c1)*u + c0. */
+		vcurve_coef[1] = vfc * (p2 - p0);
+		vcurve_coef[2] = madd(ssef(fc * 2.0f), p0, madd(ssef(fc - 3.0f), p1, msub(ssef(3.0f - 2.0f * fc), p2, vfcxp3)));
+		vcurve_coef[3] = msub(ssef(fc - 2.0f), p2 - p1, msub(vfc, p0, vfcxp3));
+
+	}
+#else  /* __KERNEL_SSE2__ */
+	float3 curve_coef[4];
+
+	/* Curve intersection check. */
+	/* Obtain curve parameters. */
+	{
+		/* Build the ray transform. TODO: this should be created once at the beginning of the intersection loop. */
+		Transform htfm;
+		float d = sqrtf(dir.x * dir.x + dir.z * dir.z);  /* NOTE(review): d is 0 when dir.x and dir.z are both 0, and the matrix below divides by d. */
+		htfm = make_transform(
+			dir.z / d, 0, -dir.x /d, 0,
+			-dir.x * dir.y /d, d, -dir.y * dir.z /d, 0,
+			dir.x, dir.y, dir.z, 0,
+			0, 0, 0, 1);
+
+		float4 v00 = kernel_tex_fetch(__curves, prim);
+
+		int k0 = __float_as_int(v00.x) + segment;
+		int k1 = k0 + 1;
+
+		int ka = max(k0 - 1,__float_as_int(v00.x));
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+		float4 P_curve[4];
+
+		if(is_curve_primitive) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+			motion_cardinal_curve_keys(kg, fobject, prim, time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p0 = transform_point(&htfm, float4_to_float3(P_curve[0]) - P);
+		float3 p1 = transform_point(&htfm, float4_to_float3(P_curve[1]) - P);
+		float3 p2 = transform_point(&htfm, float4_to_float3(P_curve[2]) - P);
+		float3 p3 = transform_point(&htfm, float4_to_float3(P_curve[3]) - P);
+
+		float fc = 0.71f;  /* Same constant as in curvetangent() and curvepoint(). */
+		curve_coef[0] = p1;  /* Cubic coefficients; the loop below evaluates ((c3*u + c2)*u + c1)*u + c0. */
+		curve_coef[1] = -fc*p0 + fc*p2;
+		curve_coef[2] = 2.0f * fc * p0 + (fc - 3.0f) * p1 + (3.0f - 2.0f * fc) * p2 - fc * p3;
+		curve_coef[3] = -fc * p0 + (2.0f - fc) * p1 + (fc - 2.0f) * p2 + fc * p3;
+		r_st = P_curve[1].w;
+		r_en = P_curve[2].w;
+	}
+#endif  /* __KERNEL_SSE2__ */
+
+	float r_curr = max(r_st, r_en);
+
+	if((flags & CURVE_KN_RIBBONS) || !(flags & CURVE_KN_BACKFACING))
+		epsilon = 2 * r_curr;
+
+	/* Find bounds along z first, then x and y - this is slow for cubic curves. */
+	float upper, lower;
+
+	float zextrem[4];
+	curvebounds(&lower, &upper, &zextrem[0], &zextrem[1], &zextrem[2], &zextrem[3], curve_coef[0].z, curve_coef[1].z, curve_coef[2].z, curve_coef[3].z);
+	if(lower - r_curr > isect->t || upper + r_curr < epsilon)
+		return false;
+
+	/* Minimum width extension. */
+	float mw_extension = min(difl * fabsf(upper), extmax);
+	float r_ext = mw_extension + r_curr;
+
+	float xextrem[4];
+	curvebounds(&lower, &upper, &xextrem[0], &xextrem[1], &xextrem[2], &xextrem[3], curve_coef[0].x, curve_coef[1].x, curve_coef[2].x, curve_coef[3].x);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	float yextrem[4];
+	curvebounds(&lower, &upper, &yextrem[0], &yextrem[1], &yextrem[2], &yextrem[3], curve_coef[0].y, curve_coef[1].y, curve_coef[2].y, curve_coef[3].y);
+	if(lower > r_ext || upper < -r_ext)
+		return false;
+
+	/* Set up the iterative subdivision loop: tree is the start index of the current span and level its length, both in units of resol. */
+	int level = 1 << depth;
+	int tree = 0;
+	float resol = 1.0f / (float)level;
+	bool hit = false;
+
+	/* Loop until tree reaches 1 << depth, i.e. the end of the segment. */
+	while(!(tree >> (depth))) {
+		const float i_st = tree * resol;
+		const float i_en = i_st + (level * resol);
+
+#ifdef __KERNEL_SSE2__
+		ssef vi_st = ssef(i_st), vi_en = ssef(i_en);
+		ssef vp_st = madd(madd(madd(vcurve_coef[3], vi_st, vcurve_coef[2]), vi_st, vcurve_coef[1]), vi_st, vcurve_coef[0]);
+		ssef vp_en = madd(madd(madd(vcurve_coef[3], vi_en, vcurve_coef[2]), vi_en, vcurve_coef[1]), vi_en, vcurve_coef[0]);
+
+		ssef vbmin = min(vp_st, vp_en);
+		ssef vbmax = max(vp_st, vp_en);
+
+		float3 &bmin = (float3 &)vbmin, &bmax = (float3 &)vbmax;
+		float &bminx = bmin.x, &bminy = bmin.y, &bminz = bmin.z;
+		float &bmaxx = bmax.x, &bmaxy = bmax.y, &bmaxz = bmax.z;
+		float3 &p_st = (float3 &)vp_st, &p_en = (float3 &)vp_en;
+#else  /* __KERNEL_SSE2__ */
+		float3 p_st = ((curve_coef[3] * i_st + curve_coef[2]) * i_st + curve_coef[1]) * i_st + curve_coef[0];
+		float3 p_en = ((curve_coef[3] * i_en + curve_coef[2]) * i_en + curve_coef[1]) * i_en + curve_coef[0];
+
+		float bminx = min(p_st.x, p_en.x);
+		float bmaxx = max(p_st.x, p_en.x);
+		float bminy = min(p_st.y, p_en.y);
+		float bmaxy = max(p_st.y, p_en.y);
+		float bminz = min(p_st.z, p_en.z);
+		float bmaxz = max(p_st.z, p_en.z);
+#endif  /* __KERNEL_SSE2__ */
+
+		if(xextrem[0] >= i_st && xextrem[0] <= i_en) {  /* Widen the span's box by any curvebounds() extremum whose parameter lies in [i_st, i_en]. */
+			bminx = min(bminx,xextrem[1]);
+			bmaxx = max(bmaxx,xextrem[1]);
+		}
+		if(xextrem[2] >= i_st && xextrem[2] <= i_en) {
+			bminx = min(bminx,xextrem[3]);
+			bmaxx = max(bmaxx,xextrem[3]);
+		}
+		if(yextrem[0] >= i_st && yextrem[0] <= i_en) {
+			bminy = min(bminy,yextrem[1]);
+			bmaxy = max(bmaxy,yextrem[1]);
+		}
+		if(yextrem[2] >= i_st && yextrem[2] <= i_en) {
+			bminy = min(bminy,yextrem[3]);
+			bmaxy = max(bmaxy,yextrem[3]);
+		}
+		if(zextrem[0] >= i_st && zextrem[0] <= i_en) {
+			bminz = min(bminz,zextrem[1]);
+			bmaxz = max(bmaxz,zextrem[1]);
+		}
+		if(zextrem[2] >= i_st && zextrem[2] <= i_en) {
+			bminz = min(bminz,zextrem[3]);
+			bmaxz = max(bmaxz,zextrem[3]);
+		}
+
+		float r1 = r_st + (r_en - r_st) * i_st;  /* Radius interpolated linearly between r_st and r_en. */
+		float r2 = r_st + (r_en - r_st) * i_en;
+		r_curr = max(r1, r2);
+
+		mw_extension = min(difl * fabsf(bmaxz), extmax);
+		float r_ext = mw_extension + r_curr;
+		float coverage = 1.0f;
+
+		if(bminz - r_curr > isect->t || bmaxz + r_curr < epsilon || bminx > r_ext|| bmaxx < -r_ext|| bminy > r_ext|| bmaxy < -r_ext) {
+			/* The bounding box does not overlap the square centered at O: skip this span. */
+			tree += level;
+			level = tree & -tree;  /* Lowest set bit of tree becomes the next span length. */
+		}
+		else if(level == 1) {
+
+			/* The maximum recursion depth is reached.
+			 * Check if dP0.(Q-P0)>=0 and dPn.(Pn-Q)>=0.
+			 * dP* is reversed if necessary. */
+			float t = isect->t;
+			float u = 0.0f;
+			float gd = 0.0f;
+
+			if(flags & CURVE_KN_RIBBONS) {
+				float3 tg = (p_en - p_st);
+#ifdef __KERNEL_SSE__  /* NOTE(review): the other SIMD paths in this function test __KERNEL_SSE2__ - confirm this is intended. */
+				const float3 tg_sq = tg * tg;
+				float w = tg_sq.x + tg_sq.y;
+#else
+				float w = tg.x * tg.x + tg.y * tg.y;
+#endif
+				if(w == 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+#ifdef __KERNEL_SSE__
+				const float3 p_sttg = p_st * tg;
+				w = -(p_sttg.x + p_sttg.y) / w;
+#else
+				w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
+#endif
+				w = saturate(w);
+
+				/* Compute u on the curve segment. */
+				u = i_st * (1 - w) + i_en * w;
+				r_curr = r_st + (r_en - r_st) * u;
+				/* Compare x-y distances. */
+				float3 p_curr = ((curve_coef[3] * u + curve_coef[2]) * u + curve_coef[1]) * u + curve_coef[0];
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if(dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				if(dot(dp_st, -p_st) + p_curr.z * dp_st.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if(dot(tg, dp_en) < 0)
+					dp_en *= -1;
+				if(dot(dp_en, p_en) - p_curr.z * dp_en.z < 0) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				/* Compute coverage. */
+				float r_ext = r_curr;
+				coverage = 1.0f;
+				if(difl != 0.0f) {
+					mw_extension = min(difl * fabsf(bmaxz), extmax);
+					r_ext = mw_extension + r_curr;
+#ifdef __KERNEL_SSE__  /* NOTE(review): _mm_hadd_ps below is an SSE3 intrinsic - confirm it is available wherever __KERNEL_SSE__ is defined. */
+					const float3 p_curr_sq = p_curr * p_curr;
+					const float3 dxxx(_mm_sqrt_ss(_mm_hadd_ps(p_curr_sq.m128, p_curr_sq.m128)));
+					float d = dxxx.x;
+#else
+					float d = sqrtf(p_curr.x * p_curr.x + p_curr.y * p_curr.y);
+#endif
+					float d0 = d - r_curr;
+					float d1 = d + r_curr;
+					float inv_mw_extension = 1.0f/mw_extension;
+					if(d0 >= 0)
+						coverage = (min(d1 * inv_mw_extension, 1.0f) - min(d0 * inv_mw_extension, 1.0f)) * 0.5f;
+					else // inside
+						coverage = (min(d1 * inv_mw_extension, 1.0f) + min(-d0 * inv_mw_extension, 1.0f)) * 0.5f;
+				}
+
+				if(p_curr.x * p_curr.x + p_curr.y * p_curr.y >= r_ext * r_ext || p_curr.z <= epsilon || isect->t < p_curr.z) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				t = p_curr.z;
+
+				/* Stochastic fade from minimum width. */
+				if(difl != 0.0f && lcg_state) {
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit;
+				}
+			}
+			else {
+				float l = len(p_en - p_st);
+				/* Minimum width extension. */
+				float or1 = r1;
+				float or2 = r2;
+
+				if(difl != 0.0f) {
+					mw_extension = min(len(p_st - P) * difl, extmax);
+					or1 = r1 < mw_extension ? mw_extension : r1;
+					mw_extension = min(len(p_en - P) * difl, extmax);
+					or2 = r2 < mw_extension ? mw_extension : r2;
+				}
+				/* --- */
+				float invl = 1.0f/l;
+				float3 tg = (p_en - p_st) * invl;
+				gd = (or2 - or1) * invl;
+				float difz = -dot(p_st,tg);
+				float cyla = 1.0f - (tg.z * tg.z * (1 + gd*gd));
+				float invcyla = 1.0f/cyla;
+				float halfb = (-p_st.z - tg.z*(difz + gd*(difz*gd + or1)));
+				float tcentre = -halfb*invcyla;
+				float zcentre = difz + (tg.z * tcentre);
+				float3 tdif = - p_st;
+				tdif.z += tcentre;
+				float tdifz = dot(tdif,tg);
+				float tb = 2*(tdif.z - tg.z*(tdifz + gd*(tdifz*gd + or1)));
+				float tc = dot(tdif,tdif) - tdifz * tdifz * (1 + gd*gd) - or1*or1 - 2*or1*tdifz*gd;
+				float td = tb*tb - 4*cyla*tc;  /* Discriminant of the quadratic cyla*x^2 + tb*x + tc. */
+				if(td < 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float rootd = sqrtf(td);
+				float correction = (-tb - rootd) * 0.5f * invcyla;
+				t = tcentre + correction;
+
+				float3 dp_st = (3 * curve_coef[3] * i_st + 2 * curve_coef[2]) * i_st + curve_coef[1];
+				if(dot(tg, dp_st)< 0)
+					dp_st *= -1;
+				float3 dp_en = (3 * curve_coef[3] * i_en + 2 * curve_coef[2]) * i_en + curve_coef[1];
+				if(dot(tg, dp_en) < 0)
+					dp_en *= -1;
+
+				if(flags & CURVE_KN_BACKFACING && (dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f)) {
+					correction = (-tb + rootd) * 0.5f * invcyla;  /* First root rejected: retry with the other root. */
+					t = tcentre + correction;
+				}
+
+				if(dot(dp_st, -p_st) + t * dp_st.z < 0 || dot(dp_en, p_en) - t * dp_en.z < 0 || isect->t < t || t <= 0.0f) {
+					tree++;
+					level = tree & -tree;
+					continue;
+				}
+
+				float w = (zcentre + (tg.z * correction)) * invl;
+				w = saturate(w);
+				/* Compute u on the curve segment. */
+				u = i_st * (1 - w) + i_en * w;
+
+				/* Stochastic fade from minimum width. */
+				if(difl != 0.0f && lcg_state) {
+					r_curr = r1 + (r2 - r1) * w;
+					r_ext = or1 + (or2 - or1) * w;
+					coverage = r_curr/r_ext;
+
+					if(coverage != 1.0f && (lcg_step_float(lcg_state) > coverage))
+						return hit;
+				}
+			}
+			/* We found a new intersection. */
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most triangles are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif  /* __VISIBILITY_FLAG__ */
+			{
+				/* Record intersection. */
+				isect->t = t;
+				isect->u = u;
+				isect->v = gd;
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+				hit = true;
+			}
+
+			tree++;
+			level = tree & -tree;
+		}
+		else {
+			/* Split the curve into two curves and process. */
+			level = level >> 1;
+		}
+	}
+
+	return hit;
+}
+
+ccl_device_curveintersect bool curve_intersect(KernelGlobals *kg,  /* Intersects a ray with the straight segment between curve keys k0 and k1. */
+                                               Intersection *isect,
+                                               float3 P,
+                                               float3 direction,
+                                               uint visibility,
+                                               int object,
+                                               int curveAddr,
+                                               float time,
+                                               int type,
+                                               uint *lcg_state,
+                                               float difl,
+                                               float extmax)
+{
+	/* Define a few macros so the scalar and SSE paths can share the code below. */
+#ifndef __KERNEL_SSE2__
+#  define len3_squared(x) len_squared(x)
+#  define len3(x) len(x)
+#  define dot3(x, y) dot(x, y)
+#endif  /* !__KERNEL_SSE2__ */
+
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {  /* Ray time lies outside [prim_time.x, prim_time.y]. */
+			return false;
+		}
+	}
+
+	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
+	/* Curve intersection check. */
+	int flags = kernel_data.curve.curveflags;
+
+	int prim = kernel_tex_fetch(__prim_index, curveAddr);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int cnum = __float_as_int(v00.x);
+	int k0 = cnum + segment;
+	int k1 = k0 + 1;
+
+#ifndef __KERNEL_SSE2__
+	float4 P_curve[2];
+
+	if(is_curve_primitive) {
+		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
+		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, P_curve);
+	}
+
+	float or1 = P_curve[0].w;  /* Original key radii, before the minimum width extension. */
+	float or2 = P_curve[1].w;
+	float3 p1 = float4_to_float3(P_curve[0]);
+	float3 p2 = float4_to_float3(P_curve[1]);
+
+	/* Minimum width extension. */
+	float r1 = or1;
+	float r2 = or2;
+	float3 dif = P - p1;
+	float3 dif_second = P - p2;
+	if(difl != 0.0f) {
+		float pixelsize = min(len3(dif) * difl, extmax);
+		r1 = or1 < pixelsize ? pixelsize : or1;
+		pixelsize = min(len3(dif_second) * difl, extmax);
+		r2 = or2 < pixelsize ? pixelsize : or2;
+	}
+	/* --- */
+
+	float3 p21_diff = p2 - p1;
+	float3 sphere_dif1 = (dif + dif_second) * 0.5f;
+	float3 dir = direction;
+	float sphere_b_tmp = dot3(dir, sphere_dif1);
+	float3 sphere_dif2 = sphere_dif1 - sphere_b_tmp * dir;
+#else  /* __KERNEL_SSE2__ */
+	ssef P_curve[2];
+
+	if(is_curve_primitive) {
+		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
+		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
+	}
+	else {
+		int fobject = (object == OBJECT_NONE)? kernel_tex_fetch(__prim_object, curveAddr): object;
+		motion_curve_keys(kg, fobject, prim, time, k0, k1, (float4*)&P_curve);
+	}
+
+	const ssef or12 = shuffle<3, 3, 3, 3>(P_curve[0], P_curve[1]);
+
+	ssef r12 = or12;
+	const ssef vP = load4f(P);
+	const ssef dif = vP - P_curve[0];
+	const ssef dif_second = vP - P_curve[1];
+	if(difl != 0.0f) {
+		const ssef len1_sq = len3_squared_splat(dif);
+		const ssef len2_sq = len3_squared_splat(dif_second);
+		const ssef len12 = mm_sqrt(shuffle<0, 0, 0, 0>(len1_sq, len2_sq));
+		const ssef pixelsize12 = min(len12 * difl, ssef(extmax));
+		r12 = max(or12, pixelsize12);
+	}
+	float or1 = extract<0>(or12), or2 = extract<0>(shuffle<2>(or12));
+	float r1 = extract<0>(r12), r2 = extract<0>(shuffle<2>(r12));
+
+	const ssef p21_diff = P_curve[1] - P_curve[0];
+	const ssef sphere_dif1 = (dif + dif_second) * 0.5f;
+	const ssef dir = load4f(direction);
+	const ssef sphere_b_tmp = dot3_splat(dir, sphere_dif1);
+	const ssef sphere_dif2 = nmadd(sphere_b_tmp, dir, sphere_dif1);
+#endif  /* __KERNEL_SSE2__ */
+
+	float mr = max(r1, r2);
+	float l = len3(p21_diff);
+	float invl = 1.0f / l;  /* NOTE(review): l is 0 when both keys coincide; there is no guard here. */
+	float sp_r = mr + 0.5f * l;
+
+	float sphere_b = dot3(dir, sphere_dif2);
+	float sdisc = sphere_b * sphere_b - len3_squared(sphere_dif2) + sp_r * sp_r;
+
+	if(sdisc < 0.0f)
+		return false;
+
+	/* Obtain parameters and test midpoint distance for suitable modes. */
+#ifndef __KERNEL_SSE2__
+	float3 tg = p21_diff * invl;
+#else
+	const ssef tg = p21_diff * invl;
+#endif
+	float gd = (r2 - r1) * invl;
+
+	float dirz = dot3(dir, tg);
+	float difz = dot3(dif, tg);
+
+	float a = 1.0f - (dirz*dirz*(1 + gd*gd));
+
+	float halfb = dot3(dir, dif) - dirz*(difz + gd*(difz*gd + r1));
+
+	float tcentre = -halfb/a;
+	float zcentre = difz + (dirz * tcentre);
+
+	if((tcentre > isect->t) && !(flags & CURVE_KN_ACCURATE))
+		return false;
+	if((zcentre < 0 || zcentre > l) && !(flags & CURVE_KN_ACCURATE) && !(flags & CURVE_KN_INTERSECTCORRECTION))
+		return false;
+
+	/* Test minimum separation. */
+#ifndef __KERNEL_SSE2__
+	float3 cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross(tg, dif));
+#else
+	const ssef cprod = cross(tg, dir);
+	float cprod2sq = len3_squared(cross_zxy(tg, dif));
+#endif
+	float cprodsq = len3_squared(cprod);
+	float distscaled = dot3(cprod, dif);
+
+	if(cprodsq == 0)
+		distscaled = cprod2sq;
+	else
+		distscaled = (distscaled*distscaled)/cprodsq;
+
+	if(distscaled > mr*mr)
+		return false;
+
+	/* Calculate true intersection. */
+#ifndef __KERNEL_SSE2__
+	float3 tdif = dif + tcentre * dir;
+#else
+	const ssef tdif = madd(ssef(tcentre), dir, dif);
+#endif
+	float tdifz = dot3(tdif, tg);
+	float tdifma = tdifz*gd + r1;
+	float tb = 2*(dot3(dir, tdif) - dirz*(tdifz + gd*tdifma));
+	float tc = dot3(tdif, tdif) - tdifz*tdifz - tdifma*tdifma;
+	float td = tb*tb - 4*a*tc;  /* Discriminant of the quadratic a*x^2 + tb*x + tc. */
+
+	if(td < 0.0f)
+		return false;
+
+	float rootd = 0.0f;
+	float correction = 0.0f;
+	if(flags & CURVE_KN_ACCURATE) {
+		rootd = sqrtf(td);
+		correction = ((-tb - rootd)/(2*a));
+	}
+
+	float t = tcentre + correction;
+
+	if(t < isect->t) {
+
+		if(flags & CURVE_KN_INTERSECTCORRECTION) {
+			rootd = sqrtf(td);
+			correction = ((-tb - rootd)/(2*a));
+			t = tcentre + correction;
+		}
+
+		float z = zcentre + (dirz * correction);
+		// bool backface = false;
+
+		if(flags & CURVE_KN_BACKFACING && (t < 0.0f || z < 0 || z > l)) {
+			// backface = true;
+			correction = ((-tb + rootd)/(2*a));  /* NOTE(review): rootd is still 0 here unless CURVE_KN_ACCURATE or CURVE_KN_INTERSECTCORRECTION is set. */
+			t = tcentre + correction;
+			z = zcentre + (dirz * correction);
+		}
+
+		/* Stochastic fade from minimum width. */
+		float adjradius = or1 + z * (or2 - or1) * invl;
+		adjradius = adjradius / (r1 + z * gd);
+		if(lcg_state && adjradius != 1.0f) {
+			if(lcg_step_float(lcg_state) > adjradius)
+				return false;
+		}
+		/* --- */
+
+		if(t > 0.0f && t < isect->t && z >= 0 && z <= l) {
+
+			if(flags & CURVE_KN_ENCLOSEFILTER) {
+				float enc_ratio = 1.01f;
+				if((difz > -r1 * enc_ratio) && (dot3(dif_second, tg) < r2 * enc_ratio)) {
+					float a2 = 1.0f - (dirz*dirz*(1 + gd*gd*enc_ratio*enc_ratio));
+					float c2 = dot3(dif, dif) - difz * difz * (1 + gd*gd*enc_ratio*enc_ratio) - r1*r1*enc_ratio*enc_ratio - 2*r1*difz*gd*enc_ratio;
+					if(a2*c2 < 0.0f)
+						return false;
+				}
+			}
+
+#ifdef __VISIBILITY_FLAG__
+			/* visibility flag test. we do it here under the assumption
+			 * that most triangles are culled by node flags */
+			if(kernel_tex_fetch(__prim_visibility, curveAddr) & visibility)
+#endif  /* __VISIBILITY_FLAG__ */
+			{
+				/* Record intersection. */
+				isect->t = t;
+				isect->u = z*invl;
+				isect->v = gd;
+				isect->prim = curveAddr;
+				isect->object = object;
+				isect->type = type;
+
+				return true;
+			}
+		}
+	}
+
+	return false;
+
+#ifndef __KERNEL_SSE2__  /* These directives are handled by the preprocessor, so they take effect although they follow the return. */
+#  undef len3_squared
+#  undef len3
+#  undef dot3
+#endif  /* !__KERNEL_SSE2__ */
+}
+
+ccl_device_inline float3 curvetangent(float t, float3 p0, float3 p1, float3 p2, float3 p3)  /* Derivative of curvepoint() with respect to t; the result is not normalized. */
+{
+	float fc = 0.71f;  /* Same constant as in curvepoint(). */
+	float data[4];  /* Weight applied to each of p0..p3. */
+	float t2 = t * t;
+	data[0] = -3.0f * fc          * t2  + 4.0f * fc * t                  - fc;
+	data[1] =  3.0f * (2.0f - fc) * t2  + 2.0f * (fc - 3.0f) * t;
+	data[2] =  3.0f * (fc - 2.0f) * t2  + 2.0f * (3.0f - 2.0f * fc) * t  + fc;
+	data[3] =  3.0f * fc          * t2  - 2.0f * fc * t;
+	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
+}
+
+ccl_device_inline float3 curvepoint(float t, float3 p0, float3 p1, float3 p2, float3 p3)  /* Cubic blend of four control points; evaluates to p1 at t = 0 and to p2 at t = 1. */
+{
+	float data[4];  /* Weight applied to each of p0..p3. */
+	float fc = 0.71f;  /* Same constant as in curvetangent(). */
+	float t2 = t * t;
+	float t3 = t2 * t;
+	data[0] = -fc          * t3  + 2.0f * fc          * t2 - fc * t;
+	data[1] =  (2.0f - fc) * t3  + (fc - 3.0f)        * t2 + 1.0f;
+	data[2] =  (fc - 2.0f) * t3  + (3.0f - 2.0f * fc) * t2 + fc * t;
+	data[3] =  fc          * t3  - fc * t2;
+	return data[0] * p0 + data[1] * p1 + data[2] * p2 + data[3] * p3;
+}
+
+ccl_device_inline float3 curve_refine(KernelGlobals *kg,  /* Computes the hit position for a curve intersection, fills sd->Ng and sd->N (plus u/v and dPdu/dPdv when enabled) and returns the position. */
+                                      ShaderData *sd,
+                                      const Intersection *isect,
+                                      const Ray *ray)
+{
+	int flag = kernel_data.curve.curveflags;
+	float t = isect->t;
+	float3 P = ray->P;
+	float3 D = ray->D;
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_itfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
+#endif  /* __OBJECT_MOTION__ */
+
+		P = transform_point(&tfm, P);
+		D = transform_direction(&tfm, D*t);  /* Transform the ray vector scaled by t so the distance can be re-derived below. */
+		D = normalize_len(D, &t);  /* NOTE(review): presumably normalize_len() stores the vector length in t - confirm. */
+	}
+
+	int prim = kernel_tex_fetch(__prim_index, isect->prim);
+	float4 v00 = kernel_tex_fetch(__curves, prim);
+
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
+	int k1 = k0 + 1;
+
+	float3 tg;  /* Curve tangent at the hit; also written to sd->dPdu below. */
+
+	if(flag & CURVE_KN_INTERPOLATE) {
+		int ka = max(k0 - 1,__float_as_int(v00.x));  /* Clamp the neighbouring keys to [v00.x, v00.x + v00.y - 1]. */
+		int kb = min(k1 + 1,__float_as_int(v00.x) + __float_as_int(v00.y) - 1);
+
+		float4 P_curve[4];
+
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
+			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
+			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
+			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
+		}
+		else {
+			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
+		}
+
+		float3 p[4];
+		p[0] = float4_to_float3(P_curve[0]);
+		p[1] = float4_to_float3(P_curve[1]);
+		p[2] = float4_to_float3(P_curve[2]);
+		p[3] = float4_to_float3(P_curve[3]);
+
+		P = P + D*t;
+
+#ifdef __UV__
+		sd->u = isect->u;
+		sd->v = 0.0f;
+#endif  /* __UV__ */
+
+		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
+
+		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {  /* Reads the same value as the local flag. */
+			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
+		}
+		else {
+			/* Direction from inside to surface of curve. */
+			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);
+			sd->Ng = normalize(P - p_curr);
+
+			/* Adjustment for changing radius. */
+			float gd = isect->v;
+
+			if(gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		/* TODO: sometimes the normal is still so that this is detected as
+		 * backfacing even if cull backfaces is enabled. */
+
+		sd->N = sd->Ng;
+	}
+	else {
+		float4 P_curve[2];
+
+		if(sd->type & PRIMITIVE_CURVE) {
+			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
+			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
+		}
+		else {
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
+		}
+
+		float l = 1.0f;
+		tg = normalize_len(float4_to_float3(P_curve[1] - P_curve[0]), &l);
+
+		P = P + D*t;
+
+		float3 dif = P - float4_to_float3(P_curve[0]);
+
+#ifdef __UV__
+		sd->u = dot(dif,tg)/l;
+		sd->v = 0.0f;
+#endif  /* __UV__ */
+
+		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
+			sd->Ng = -(D - tg * dot(tg, D));
+			sd->Ng = normalize(sd->Ng);
+		}
+		else {
+			float gd = isect->v;
+
+			/* Direction from inside to surface of curve. */
+			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
+
+			/* Adjustment for changing radius. */
+			if(gd != 0.0f) {
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
+			}
+		}
+
+		sd->N = sd->Ng;
+	}
+
+#ifdef __DPDU__
+	/* dPdu is the tangent; dPdv is its cross product with Ng. */
+	sd->dPdu = tg;
+	sd->dPdv = cross(tg, sd->Ng);
+#endif  /* __DPDU__ */
+
+	if(isect->object != OBJECT_NONE) {
+#ifdef __OBJECT_MOTION__
+		Transform tfm = sd->ob_tfm;
+#else
+		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
+#endif  /* __OBJECT_MOTION__ */
+
+		P = transform_point(&tfm, P);  /* Apply the forward object transform to the hit position before returning it. */
+	}
+
+	return P;
+}
+
+#endif  /* __HAIR__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index 6ecdfe0173a..1ffc143be34 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -415,12 +415,7 @@ ccl_device_inline float3 bvh_clamp_direction(float3 dir)
 
 ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
 	return rcp(dir);
-#else
-	return 1.0f / dir;
-#endif
 }
 
 /* Transform ray into object space to enter static object in BVH */
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 1e0ef5201c9..698cd6b03fd 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -35,10 +35,10 @@ ccl_device float4 volume_image_texture_3d(int id, float x, float y, float z)
 	float4 r;
 	switch(id) {
 		case 0: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_000, x, y, z); break;
-		case 1: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_001, x, y, z); break;
-		case 2: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_002, x, y, z); break;
-		case 3: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_003, x, y, z); break;
-		case 4: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_004, x, y, z); break;
+		case 8: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_008, x, y, z); break;
+		case 16: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_016, x, y, z); break;
+		case 24: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_024, x, y, z); break;
+		case 32: r = kernel_tex_image_interp_3d(__tex_image_float4_3d_032, x, y, z); break;
 	}
 	return r;
 }
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 175bd6b9737..ae5f6e5e070 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -21,6 +21,9 @@ CCL_NAMESPACE_BEGIN
  * BSDF evaluation result, split per BSDF type. This is used to accumulate
  * render passes separately. */
 
+ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg,
+                                           const ShaderData *sd);
+
 ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 value, int use_light_pass)
 {
 #ifdef __PASSES__
@@ -178,7 +181,6 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 
 	if(use_light_pass) {
 		L->indirect = make_float3(0.0f, 0.0f, 0.0f);
-		L->direct_throughput = make_float3(0.0f, 0.0f, 0.0f);
 		L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->color_diffuse = make_float3(0.0f, 0.0f, 0.0f);
@@ -199,57 +201,78 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 		L->indirect_subsurface = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect_scatter = make_float3(0.0f, 0.0f, 0.0f);
 
-		L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
-
+		L->transparent = 0.0f;
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->background = make_float3(0.0f, 0.0f, 0.0f);
 		L->ao = make_float3(0.0f, 0.0f, 0.0f);
 		L->shadow = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 		L->mist = 0.0f;
+
+		L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.direct = make_float3(0.0f, 0.0f, 0.0f);
 	}
 	else
 #endif
 	{
+		L->transparent = 0.0f;
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 	}
 
 #ifdef __SHADOW_TRICKS__
 	L->path_total = make_float3(0.0f, 0.0f, 0.0f);
 	L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
-	L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
+	L->shadow_background_color = make_float3(0.0f, 0.0f, 0.0f);
+	L->shadow_throughput = 0.0f;
+	L->shadow_transparency = 1.0f;
+	L->has_shadow_catcher = 0;
 #endif
 
 #ifdef __DENOISING_FEATURES__
 	L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
 	L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
 	L->denoising_depth = 0.0f;
-#endif  /* __DENOISING_FEATURES__ */
+#endif
+
+#ifdef __KERNEL_DEBUG__
+	L->debug_data.num_bvh_traversed_nodes = 0;
+	L->debug_data.num_bvh_traversed_instances = 0;
+	L->debug_data.num_bvh_intersections = 0;
+	L->debug_data.num_ray_bounces = 0;
+#endif
 }
 
-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
-	BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
+ccl_device_inline void path_radiance_bsdf_bounce(
+	KernelGlobals *kg,
+	PathRadianceState *L_state,
+	ccl_addr_space float3 *throughput,
+	BsdfEval *bsdf_eval,
+	float bsdf_pdf, int bounce, int bsdf_label)
 {
 	float inverse_pdf = 1.0f/bsdf_pdf;
 
 #ifdef __PASSES__
-	if(L->use_light_pass) {
+	if(kernel_data.film.use_light_pass) {
 		if(bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) {
 			/* first on directly visible surface */
 			float3 value = *throughput*inverse_pdf;
 
-			L->path_diffuse = bsdf_eval->diffuse*value;
-			L->path_glossy = bsdf_eval->glossy*value;
-			L->path_transmission = bsdf_eval->transmission*value;
-			L->path_subsurface = bsdf_eval->subsurface*value;
-			L->path_scatter = bsdf_eval->scatter*value;
-
-			*throughput = L->path_diffuse + L->path_glossy + L->path_transmission + L->path_subsurface + L->path_scatter;
+			L_state->diffuse = bsdf_eval->diffuse*value;
+			L_state->glossy = bsdf_eval->glossy*value;
+			L_state->transmission = bsdf_eval->transmission*value;
+			L_state->subsurface = bsdf_eval->subsurface*value;
+			L_state->scatter = bsdf_eval->scatter*value;
+
+			*throughput = L_state->diffuse +
+			              L_state->glossy +
+			              L_state->transmission +
+			              L_state->subsurface +
+			              L_state->scatter;
 			
-			L->direct_throughput = *throughput;
+			L_state->direct = *throughput;
 		}
 		else {
 			/* transparent bounce before first hit, or indirectly visible through BSDF */
@@ -264,13 +287,22 @@ ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space
 	}
 }
 
-ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 throughput, float3 value, int bounce)
+ccl_device_inline void path_radiance_accum_emission(PathRadiance *L,
+                                                    ccl_addr_space PathState *state,
+                                                    float3 throughput,
+                                                    float3 value)
 {
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		return;
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0)
+		if(state->bounce == 0)
 			L->emission += throughput*value;
-		else if(bounce == 1)
+		else if(state->bounce == 1)
 			L->direct_emission += throughput*value;
 		else
 			L->indirect += throughput*value;
@@ -289,6 +321,18 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
                                               float3 bsdf,
                                               float3 ao)
 {
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		float3 light = throughput * bsdf;
+		L->path_total += light;
+		L->path_total_shaded += ao * light;
+
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			return;
+		}
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
 		if(state->bounce == 0) {
@@ -306,14 +350,6 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
 	{
 		L->emission += throughput*bsdf*ao;
 	}
-
-#ifdef __SHADOW_TRICKS__
-	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
-		float3 light = throughput * bsdf;
-		L->path_total += light;
-		L->path_total_shaded += ao * light;
-	}
-#endif
 }
 
 ccl_device_inline void path_radiance_accum_total_ao(
@@ -342,6 +378,18 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
                                                  float shadow_fac,
                                                  bool is_lamp)
 {
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		float3 light = throughput * bsdf_eval->sum_no_mis;
+		L->path_total += light;
+		L->path_total_shaded += shadow * light;
+
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			return;
+		}
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
 		if(state->bounce == 0) {
@@ -368,14 +416,6 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
 	{
 		L->emission += throughput*bsdf_eval->diffuse*shadow;
 	}
-
-#ifdef __SHADOW_TRICKS__
-	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
-		float3 light = throughput * bsdf_eval->sum_no_mis;
-		L->path_total += light;
-		L->path_total_shaded += shadow * light;
-	}
-#endif
 }
 
 ccl_device_inline void path_radiance_accum_total_light(
@@ -396,11 +436,24 @@ ccl_device_inline void path_radiance_accum_total_light(
 #endif
 }
 
-ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
-                                                      ccl_addr_space PathState *state,
-                                                      float3 throughput,
-                                                      float3 value)
+ccl_device_inline void path_radiance_accum_background(
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        float3 throughput,
+        float3 value)
 {
+
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
+		L->path_total += throughput * value;
+		L->path_total_shaded += throughput * value * L->shadow_transparency;
+
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			return;
+		}
+	}
+#endif
+
 #ifdef __PASSES__
 	if(L->use_light_pass) {
 		if(state->bounce == 0)
@@ -416,20 +469,31 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
 		L->emission += throughput*value;
 	}
 
-#ifdef __SHADOW_TRICKS__
-	if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
-		L->path_total += throughput * value;
-		if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
-			L->path_total_shaded += throughput * value;
-		}
-	}
-#endif
-
 #ifdef __DENOISING_FEATURES__
 	L->denoising_albedo += state->denoising_feature_weight * value;
 #endif  /* __DENOISING_FEATURES__ */
 }
 
+ccl_device_inline void path_radiance_accum_transparent(
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        float3 throughput)
+{
+	L->transparent += average(throughput);
+}
+
+#ifdef __SHADOW_TRICKS__
+ccl_device_inline void path_radiance_accum_shadowcatcher(
+        PathRadiance *L,
+        float3 throughput,
+        float3 background)
+{
+	L->shadow_throughput += average(throughput);
+	L->shadow_background_color += throughput * background;
+	L->has_shadow_catcher = 1;
+}
+#endif
+
 ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
 {
 #ifdef __PASSES__
@@ -437,19 +501,19 @@ ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
 	 * only a single throughput further along the path, here we recover just
 	 * the indirect path that is not influenced by any particular BSDF type */
 	if(L->use_light_pass) {
-		L->direct_emission = safe_divide_color(L->direct_emission, L->direct_throughput);
-		L->direct_diffuse += L->path_diffuse*L->direct_emission;
-		L->direct_glossy += L->path_glossy*L->direct_emission;
-		L->direct_transmission += L->path_transmission*L->direct_emission;
-		L->direct_subsurface += L->path_subsurface*L->direct_emission;
-		L->direct_scatter += L->path_scatter*L->direct_emission;
-
-		L->indirect = safe_divide_color(L->indirect, L->direct_throughput);
-		L->indirect_diffuse += L->path_diffuse*L->indirect;
-		L->indirect_glossy += L->path_glossy*L->indirect;
-		L->indirect_transmission += L->path_transmission*L->indirect;
-		L->indirect_subsurface += L->path_subsurface*L->indirect;
-		L->indirect_scatter += L->path_scatter*L->indirect;
+		L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct);
+		L->direct_diffuse += L->state.diffuse*L->direct_emission;
+		L->direct_glossy += L->state.glossy*L->direct_emission;
+		L->direct_transmission += L->state.transmission*L->direct_emission;
+		L->direct_subsurface += L->state.subsurface*L->direct_emission;
+		L->direct_scatter += L->state.scatter*L->direct_emission;
+
+		L->indirect = safe_divide_color(L->indirect, L->state.direct);
+		L->indirect_diffuse += L->state.diffuse*L->indirect;
+		L->indirect_glossy += L->state.glossy*L->indirect;
+		L->indirect_transmission += L->state.transmission*L->indirect;
+		L->indirect_subsurface += L->state.subsurface*L->indirect;
+		L->indirect_scatter += L->state.scatter*L->indirect;
 	}
 #endif
 }
@@ -458,11 +522,11 @@ ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		L->path_diffuse = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_glossy = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_transmission = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_subsurface = make_float3(0.0f, 0.0f, 0.0f);
-		L->path_scatter = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.diffuse = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.glossy = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.transmission = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.subsurface = make_float3(0.0f, 0.0f, 0.0f);
+		L->state.scatter = make_float3(0.0f, 0.0f, 0.0f);
 
 		L->direct_emission = make_float3(0.0f, 0.0f, 0.0f);
 		L->indirect = make_float3(0.0f, 0.0f, 0.0f);
@@ -475,11 +539,7 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L,
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		L->path_diffuse = L_src->path_diffuse;
-		L->path_glossy = L_src->path_glossy;
-		L->path_transmission = L_src->path_transmission;
-		L->path_subsurface = L_src->path_subsurface;
-		L->path_scatter = L_src->path_scatter;
+		L->state = L_src->state;
 
 		L->direct_emission = L_src->direct_emission;
 		L->indirect = L_src->indirect;
@@ -487,7 +547,36 @@ ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L,
 #endif
 }
 
-ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L)
+#ifdef __SHADOW_TRICKS__
+ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg,
+                                                       PathRadiance *L,
+                                                       float3 *L_sum,
+                                                       float *alpha)
+{
+	/* Calculate current shadow of the path. */
+	float path_total = average(L->path_total);
+	float shadow;
+
+	if(path_total == 0.0f) {
+		shadow = L->shadow_transparency;
+	}
+	else {
+		float path_total_shaded = average(L->path_total_shaded);
+		shadow = path_total_shaded / path_total;
+	}
+
+	/* Calculate final light sum and transparency for shadow catcher object. */
+	if(kernel_data.background.transparent) {
+		*alpha -= L->shadow_throughput * shadow;
+	}
+	else {
+		L->shadow_background_color *= shadow;
+		*L_sum += L->shadow_background_color;
+	}
+}
+#endif
+
+ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadiance *L, float *alpha)
 {
 	float3 L_sum;
 	/* Light Passes are used */
@@ -564,8 +653,6 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 			L_sum = L_direct + L_indirect;
 		}
 #endif
-
-		return L_sum;
 	}
 
 	/* No Light Passes */
@@ -573,14 +660,24 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 #endif
 	{
 		L_sum = L->emission;
+
+		/* Reject invalid value */
+		float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
+		if(!isfinite_safe(sum)) {
+			kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
+			L_sum = make_float3(0.0f, 0.0f, 0.0f);
+		}
 	}
 
-	/* Reject invalid value */
-	float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
-	if(!isfinite_safe(sum)) {
-		kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
-		L_sum = make_float3(0.0f, 0.0f, 0.0f);
+	/* Compute alpha. */
+	*alpha = 1.0f - L->transparent;
+
+	/* Add shadow catcher contributions. */
+#ifdef __SHADOW_TRICKS__
+	if(L->has_shadow_catcher) {
+		path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha);
 	}
+#endif  /* __SHADOW_TRICKS__ */
 
 	return L_sum;
 }
@@ -613,14 +710,18 @@ ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadi
 	*clean = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
+#ifdef __SHADOW_TRICKS__
+	if(L->has_shadow_catcher) {
+		*noisy += L->shadow_background_color;
+	}
+#endif
+
 	*noisy = ensure_finite3(*noisy);
 	*clean = ensure_finite3(*clean);
 }
 
-ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
+ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample)
 {
-	float fac = 1.0f/num_samples;
-
 #ifdef __SPLIT_KERNEL__
 #  define safe_float3_add(f, v) \
 	do { \
@@ -629,65 +730,35 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
 		atomic_add_and_fetch_float(p+1, (v).y); \
 		atomic_add_and_fetch_float(p+2, (v).z); \
 	} while(0)
+#  define safe_float_add(f, v) \
+		atomic_add_and_fetch_float(&(f), (v))
 #else
 #  define safe_float3_add(f, v) (f) += (v)
+#  define safe_float_add(f, v) (f) += (v)
 #endif  /* __SPLIT_KERNEL__ */
 
 #ifdef __PASSES__
-	safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse*fac);
-	safe_float3_add(L->direct_glossy, L_sample->direct_glossy*fac);
-	safe_float3_add(L->direct_transmission, L_sample->direct_transmission*fac);
-	safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface*fac);
-	safe_float3_add(L->direct_scatter, L_sample->direct_scatter*fac);
-
-	safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse*fac);
-	safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy*fac);
-	safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission*fac);
-	safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface*fac);
-	safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter*fac);
-
-	safe_float3_add(L->background, L_sample->background*fac);
-	safe_float3_add(L->ao, L_sample->ao*fac);
-	safe_float3_add(L->shadow, L_sample->shadow*fac);
-#  ifdef __SPLIT_KERNEL__
-	atomic_add_and_fetch_float(&L->mist, L_sample->mist*fac);
-#  else
-	L->mist += L_sample->mist*fac;
-#  endif  /* __SPLIT_KERNEL__ */
+	safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
+	safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
+	safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
+	safe_float3_add(L->direct_subsurface, L_sample->direct_subsurface);
+	safe_float3_add(L->direct_scatter, L_sample->direct_scatter);
+
+	safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
+	safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
+	safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
+	safe_float3_add(L->indirect_subsurface, L_sample->indirect_subsurface);
+	safe_float3_add(L->indirect_scatter, L_sample->indirect_scatter);
+
+	safe_float3_add(L->background, L_sample->background);
+	safe_float3_add(L->ao, L_sample->ao);
+	safe_float3_add(L->shadow, L_sample->shadow);
+	safe_float_add(L->mist, L_sample->mist);
 #endif  /* __PASSES__ */
-	safe_float3_add(L->emission, L_sample->emission*fac);
+	safe_float3_add(L->emission, L_sample->emission);
 
+#undef safe_float_add
 #undef safe_float3_add
 }
 
-#ifdef __SHADOW_TRICKS__
-/* Calculate current shadow of the path. */
-ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
-{
-	float path_total = average(L->path_total);
-	float path_total_shaded = average(L->path_total_shaded);
-	if(path_total != 0.0f) {
-		return path_total_shaded / path_total;
-	}
-	return 1.0f;
-}
-
-/* Calculate final light sum and transparency for shadow catcher object. */
-ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
-                                                         const PathRadiance *L,
-                                                         float* alpha)
-{
-	const float shadow = path_radiance_sum_shadow(L);
-	float3 L_sum;
-	if(kernel_data.background.transparent) {
-		*alpha = 1.0f-shadow;
-		L_sum = make_float3(0.0f, 0.0f, 0.0f);
-	}
-	else {
-		L_sum = L->shadow_color * shadow;
-	}
-	return L_sum;
-}
-#endif
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index f18d145f7cf..4d89839c46c 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void compute_light_pass(KernelGlobals *kg,
                                           ShaderData *sd,
                                           PathRadiance *L,
-                                          RNG rng,
+                                          uint rng_hash,
                                           int pass_filter,
                                           int sample)
 {
@@ -48,11 +48,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 	path_radiance_init(&L_sample, kernel_data.film.use_light_pass);
 
 	/* init path state */
-	path_state_init(kg, &emission_sd, &state, &rng, sample, NULL);
+	path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL);
 
 	/* evaluate surface shader */
-	float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
-	shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
+	shader_eval_surface(kg, sd, &state, state.flag);
 
 	/* TODO, disable more closures we don't need besides transparent */
 	shader_bsdf_disable_transparency(kg, sd);
@@ -64,13 +63,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+			kernel_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput, shader_bsdf_alpha(kg, sd));
 		}
 
 		/* sample emission */
 		if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
 			float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			path_radiance_accum_emission(&L_sample, &state, throughput, emission);
 		}
 
 		bool is_sss_sample = false;
@@ -86,7 +85,6 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 			                                  &emission_sd,
 			                                  &L_sample,
 			                                  &state,
-			                                  &rng,
 			                                  &ray,
 			                                  &throughput,
 			                                  &ss_indirect))
@@ -101,13 +99,10 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 					kernel_path_indirect(kg,
 					                     &indirect_sd,
 					                     &emission_sd,
-					                     &rng,
 					                     &ray,
 					                     throughput,
-					                     state.num_samples,
 					                     &state,
 					                     &L_sample);
-					kernel_path_subsurface_accum_indirect(&ss_indirect, &L_sample);
 				}
 				is_sss_sample = true;
 			}
@@ -116,14 +111,14 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample light and BSDF */
 		if(!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
-			kernel_path_surface_connect_light(kg, &rng, sd, &emission_sd, throughput, &state, &L_sample);
+			kernel_path_surface_connect_light(kg, sd, &emission_sd, throughput, &state, &L_sample);
 
-			if(kernel_path_surface_bounce(kg, &rng, sd, &throughput, &state, &L_sample, &ray)) {
+			if(kernel_path_surface_bounce(kg, sd, &throughput, &state, &L_sample.state, &ray)) {
 #ifdef __LAMP_MIS__
 				state.ray_t = 0.0f;
 #endif
 				/* compute indirect light */
-				kernel_path_indirect(kg, &indirect_sd, &emission_sd, &rng, &ray, throughput, 1, &state, &L_sample);
+				kernel_path_indirect(kg, &indirect_sd, &emission_sd, &ray, throughput, &state, &L_sample);
 
 				/* sum and reset indirect light pass variables for the next samples */
 				path_radiance_sum_indirect(&L_sample);
@@ -137,13 +132,13 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 
 		/* sample ambient occlusion */
 		if(pass_filter & BAKE_FILTER_AO) {
-			kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, &rng, throughput);
+			kernel_branched_path_ao(kg, sd, &emission_sd, &L_sample, &state, throughput);
 		}
 
 		/* sample emission */
 		if((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
 			float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(&L_sample, throughput, emission, state.bounce);
+			path_radiance_accum_emission(&L_sample, &state, throughput, emission);
 		}
 
 #ifdef __SUBSURFACE__
@@ -151,7 +146,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 		if((pass_filter & BAKE_FILTER_SUBSURFACE) && (sd->flag & SD_BSSRDF)) {
 			/* when mixing BSSRDF and BSDF closures we should skip BSDF lighting if scattering was successful */
 			kernel_branched_path_subsurface_scatter(kg, sd, &indirect_sd,
-				&emission_sd, &L_sample, &state, &rng, &ray, throughput);
+				&emission_sd, &L_sample, &state, &ray, throughput);
 		}
 #endif
 
@@ -161,20 +156,20 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 			/* direct light */
 			if(kernel_data.integrator.use_direct_light) {
 				int all = kernel_data.integrator.sample_all_lights_direct;
-				kernel_branched_path_surface_connect_light(kg, &rng,
+				kernel_branched_path_surface_connect_light(kg,
 					sd, &emission_sd, &state, throughput, 1.0f, &L_sample, all);
 			}
 #endif
 
 			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, &rng,
+			kernel_branched_path_surface_indirect_light(kg,
 				sd, &indirect_sd, &emission_sd, throughput, 1.0f, &state, &L_sample);
 		}
 	}
 #endif
 
 	/* accumulate into master L */
-	path_radiance_accum_sample(L, &L_sample, 1);
+	path_radiance_accum_sample(L, &L_sample);
 }
 
 ccl_device bool is_aa_pass(ShaderEvalType type)
@@ -225,7 +220,6 @@ ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
 
 ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
                                                        ShaderData *sd,
-                                                       RNG *rng,
                                                        PathState *state,
                                                        float3 direct,
                                                        float3 indirect,
@@ -245,12 +239,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
 		}
 		else {
 			/* surface color of the pass only */
-			shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN);
+			shader_eval_surface(kg, sd, state, 0);
 			return kernel_bake_shader_bsdf(kg, sd, type);
 		}
 	}
 	else {
-		shader_eval_surface(kg, sd, rng, state, 0.0f, 0, SHADER_CONTEXT_MAIN);
+		shader_eval_surface(kg, sd, state, 0);
 		color = kernel_bake_shader_bsdf(kg, sd, type);
 	}
 
@@ -292,14 +286,14 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 	int num_samples = kernel_data.integrator.aa_samples;
 
 	/* random number generator */
-	RNG rng = cmj_hash(offset + i, kernel_data.integrator.seed);
+	uint rng_hash = cmj_hash(offset + i, kernel_data.integrator.seed);
 
 	float filter_x, filter_y;
 	if(sample == 0) {
 		filter_x = filter_y = 0.5f;
 	}
 	else {
-		path_rng_2D(kg, &rng, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
+		path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
 	}
 
 	/* subpixel u/v offset */
@@ -335,18 +329,18 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 
 	/* light passes if we need more than color */
 	if(pass_filter & ~BAKE_FILTER_COLOR)
-		compute_light_pass(kg, &sd, &L, rng, pass_filter, sample);
+		compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
 
 	switch(type) {
 		/* data passes */
 		case SHADER_EVAL_NORMAL:
 		{
 			if((sd.flag & SD_HAS_BUMP)) {
-				shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_MAIN);
+				shader_eval_surface(kg, &sd, &state, 0);
 			}
 
-			/* compression: normal = (2 * color) - 1 */
-			out = sd.N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
+			/* encoding: normal = (2 * color) - 1 */
+			out = shader_bsdf_average_normal(kg, &sd) * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
 			break;
 		}
 		case SHADER_EVAL_UV:
@@ -356,7 +350,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		}
 		case SHADER_EVAL_EMISSION:
 		{
-			shader_eval_surface(kg, &sd, &rng, &state, 0.f, 0, SHADER_CONTEXT_EMISSION);
+			shader_eval_surface(kg, &sd, &state, 0);
 			out = shader_emissive_eval(kg, &sd);
 			break;
 		}
@@ -371,7 +365,8 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		case SHADER_EVAL_COMBINED:
 		{
 			if((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) {
-				out = path_radiance_clamp_and_sum(kg, &L);
+				float alpha;
+				out = path_radiance_clamp_and_sum(kg, &L, &alpha);
 				break;
 			}
 
@@ -409,7 +404,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		{
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_diffuse,
 			                                           L.indirect_diffuse,
@@ -421,7 +415,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		{
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_glossy,
 			                                           L.indirect_glossy,
@@ -433,7 +426,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		{
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_transmission,
 			                                           L.indirect_transmission,
@@ -446,7 +438,6 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 #ifdef __SUBSURFACE__
 			out = kernel_bake_evaluate_direct_indirect(kg,
 			                                           &sd,
-			                                           &rng,
 			                                           &state,
 			                                           L.direct_subsurface,
 			                                           L.indirect_subsurface,
@@ -480,7 +471,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 
 			/* evaluate */
 			int flag = 0; /* we can't know which type of BSDF this is for */
-			out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN);
+			out = shader_eval_background(kg, &sd, &state, flag);
 			break;
 		}
 		default:
@@ -524,7 +515,7 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
 
 		/* evaluate */
 		float3 P = sd.P;
-		shader_eval_displacement(kg, &sd, &state, SHADER_CONTEXT_MAIN);
+		shader_eval_displacement(kg, &sd, &state);
 		out = sd.P - P;
 
 		object_inverse_dir_transform(kg, &sd, &out);
@@ -552,7 +543,7 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
 
 		/* evaluate */
 		int flag = 0; /* we can't know which type of BSDF this is for */
-		out = shader_eval_background(kg, &sd, &state, flag, SHADER_CONTEXT_MAIN);
+		out = shader_eval_background(kg, &sd, &state, flag);
 	}
 	
 	/* write output */
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 38708f7ff0b..1e2af9de8b3 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -38,11 +38,15 @@
 /* Qualifier wrappers for different names on different devices */
 
 #define ccl_device  __device__ __inline__
+#if __CUDA_ARCH__ < 300
+#  define ccl_device_inline  __device__ __inline__
 #  define ccl_device_forceinline  __device__ __forceinline__
-#if __CUDA_ARCH__ < 500
+#elif __CUDA_ARCH__ < 500
 #  define ccl_device_inline  __device__ __forceinline__
+#  define ccl_device_forceinline  __device__ __forceinline__
 #else
 #  define ccl_device_inline  __device__ __inline__
+#  define ccl_device_forceinline  __device__ __forceinline__
 #endif
 #define ccl_device_noinline  __device__ __noinline__
 #define ccl_global
@@ -53,6 +57,10 @@
 #define ccl_may_alias
 #define ccl_addr_space
 #define ccl_restrict __restrict__
+/* TODO(sergey): In theory we might use references with CUDA, however the
+ * performance impact is yet to be investigated.
+ */
+#define ccl_ref
 #define ccl_align(n) __align__(n)
 
 #define ATTR_FALLTHROUGH
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index 4836c290312..36d6031d042 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -42,6 +42,7 @@
 #define ccl_local_param __local
 #define ccl_private __private
 #define ccl_restrict restrict
+#define ccl_ref
 #define ccl_align(n) __attribute__((aligned(n)))
 
 #ifdef __SPLIT_KERNEL__
@@ -129,6 +130,7 @@
 #  define expf(x) native_exp(((float)(x)))
 #  define sqrtf(x) native_sqrt(((float)(x)))
 #  define logf(x) native_log(((float)(x)))
+#  define rcp(x)  native_recip(x)
 #else
 #  define sinf(x) sin(((float)(x)))
 #  define cosf(x) cos(((float)(x)))
@@ -136,11 +138,12 @@
 #  define expf(x) exp(((float)(x)))
 #  define sqrtf(x) sqrt(((float)(x)))
 #  define logf(x) log(((float)(x)))
+#  define rcp(x)  (1.0f/(x))  /* no non-native recip() builtin in OpenCL C */
 #endif
 
 /* data lookup defines */
 #define kernel_data (*kg->data)
-#define kernel_tex_fetch(t, index) kg->t[index]
+#define kernel_tex_fetch(tex, index) ((ccl_global tex##_t*)(kg->buffers[kg->tex.buffer] + kg->tex.offset))[(index)]
 
 /* define NULL */
 #define NULL 0
diff --git a/intern/cycles/kernel/kernel_debug.h b/intern/cycles/kernel/kernel_debug.h
deleted file mode 100644
index 5647bbae5b5..00000000000
--- a/intern/cycles/kernel/kernel_debug.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright 2011-2014 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void debug_data_init(DebugData *debug_data)
-{
-	debug_data->num_bvh_traversed_nodes = 0;
-	debug_data->num_bvh_traversed_instances = 0;
-	debug_data->num_bvh_intersections = 0;
-	debug_data->num_ray_bounces = 0;
-}
-
-ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
-                                                 ccl_global float *buffer,
-                                                 ccl_addr_space PathState *state,
-                                                 DebugData *debug_data,
-                                                 int sample)
-{
-	int flag = kernel_data.film.pass_flag;
-	if(flag & PASS_BVH_TRAVERSED_NODES) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes,
-		                        sample,
-		                        debug_data->num_bvh_traversed_nodes);
-	}
-	if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
-		                        sample,
-		                        debug_data->num_bvh_traversed_instances);
-	}
-	if(flag & PASS_BVH_INTERSECTIONS) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections,
-		                        sample,
-		                        debug_data->num_bvh_intersections);
-	}
-	if(flag & PASS_RAY_BOUNCES) {
-		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
-		                        sample,
-		                        debug_data->num_ray_bounces);
-	}
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 9e7d51f23f5..45b8c6311e1 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -37,16 +37,14 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		ray.D = ls->D;
 		ray.P = ls->P;
 		ray.t = 1.0f;
-#  ifdef __OBJECT_MOTION__
 		ray.time = time;
-#  endif
 		ray.dP = differential3_zero();
 		ray.dD = dI;
 
 		shader_setup_from_background(kg, emission_sd, &ray);
 
 		path_state_modify_bounce(state, true);
-		eval = shader_eval_background(kg, emission_sd, state, 0, SHADER_CONTEXT_EMISSION);
+		eval = shader_eval_background(kg, emission_sd, state, 0);
 		path_state_modify_bounce(state, false);
 	}
 	else
@@ -72,7 +70,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
 		path_state_modify_bounce(state, true);
-		shader_eval_surface(kg, emission_sd, NULL, state, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+		shader_eval_surface(kg, emission_sd, state, 0);
 		path_state_modify_bounce(state, false);
 
 		/* evaluate emissive closure */
@@ -216,7 +214,7 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
-		float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
+		float pdf = triangle_light_pdf(kg, sd, t);
 		float mis_weight = power_heuristic(bsdf_pdf, pdf);
 
 		return L*mis_weight;
@@ -319,7 +317,7 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg,
 #  endif
 
 	path_state_modify_bounce(state, true);
-	float3 L = shader_eval_background(kg, emission_sd, state, state->flag, SHADER_CONTEXT_EMISSION);
+	float3 L = shader_eval_background(kg, emission_sd, state, state->flag);
 	path_state_modify_bounce(state, false);
 
 #ifdef __BACKGROUND_MIS__
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index f95f0d98c52..9d55183d94b 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -23,6 +23,10 @@
 #  include "util/util_vector.h"
 #endif
 
+#ifdef __KERNEL_OPENCL__
+#  include "util/util_atomic.h"
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -109,11 +113,22 @@ typedef struct KernelGlobals {
 
 #ifdef __KERNEL_OPENCL__
 
+#  define KERNEL_TEX(type, ttype, name) \
+typedef type name##_t;
+#  include "kernel/kernel_textures.h"
+
+typedef struct tex_info_t {
+	uint buffer, padding;
+	uint64_t offset;
+	uint width, height, depth, options;
+} tex_info_t;
+
 typedef ccl_addr_space struct KernelGlobals {
 	ccl_constant KernelData *data;
+	ccl_global char *buffers[8];
 
 #  define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name;
+	tex_info_t name;
 #  include "kernel/kernel_textures.h"
 
 #  ifdef __SPLIT_KERNEL__
@@ -122,6 +137,57 @@ typedef ccl_addr_space struct KernelGlobals {
 #  endif
 } KernelGlobals;
 
+#define KERNEL_BUFFER_PARAMS \
+	ccl_global char *buffer0, \
+	ccl_global char *buffer1, \
+	ccl_global char *buffer2, \
+	ccl_global char *buffer3, \
+	ccl_global char *buffer4, \
+	ccl_global char *buffer5, \
+	ccl_global char *buffer6, \
+	ccl_global char *buffer7
+
+#define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
+
+ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
+{
+#ifdef __SPLIT_KERNEL__
+	if(ccl_local_id(0) + ccl_local_id(1) == 0)
+#endif
+	{
+		kg->buffers[0] = buffer0;
+		kg->buffers[1] = buffer1;
+		kg->buffers[2] = buffer2;
+		kg->buffers[3] = buffer3;
+		kg->buffers[4] = buffer4;
+		kg->buffers[5] = buffer5;
+		kg->buffers[6] = buffer6;
+		kg->buffers[7] = buffer7;
+	}
+
+#  ifdef __SPLIT_KERNEL__
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#  endif
+}
+
+ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
+{
+#  ifdef __SPLIT_KERNEL__
+	if(ccl_local_id(0) + ccl_local_id(1) == 0)
+#  endif
+	{
+		ccl_global tex_info_t *info = (ccl_global tex_info_t*)kg->buffers[0];
+
+#  define KERNEL_TEX(type, ttype, name) \
+		kg->name = *(info++);
+#  include "kernel/kernel_textures.h"
+	}
+
+#  ifdef __SPLIT_KERNEL__
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+#  endif
+}
+
 #endif  /* __KERNEL_OPENCL__ */
 
 /* Interpolated lookup table access */
diff --git a/intern/cycles/kernel/kernel_image_opencl.h b/intern/cycles/kernel/kernel_image_opencl.h
index 90747e09357..9e3373432ec 100644
--- a/intern/cycles/kernel/kernel_image_opencl.h
+++ b/intern/cycles/kernel/kernel_image_opencl.h
@@ -15,30 +15,42 @@
  */
 
 
-/* For OpenCL all images are packed in a single array, and we do manual lookup
- * and interpolation. */
+/* For OpenCL we do manual lookup and interpolation. */
+
+ccl_device_inline ccl_global tex_info_t* kernel_tex_info(KernelGlobals *kg, uint id) {
+	const uint tex_offset = id
+#define KERNEL_TEX(type, ttype, name) + 1
+#include "kernel/kernel_textures.h"
+	;
+
+	return &((ccl_global tex_info_t*)kg->buffers[0])[tex_offset];
+}
+
+#define tex_fetch(type, info, index) ((ccl_global type*)(kg->buffers[info->buffer] + info->offset))[(index)]
 
 ccl_device_inline float4 svm_image_texture_read(KernelGlobals *kg, int id, int offset)
 {
+	const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
 	const int texture_type = kernel_tex_type(id);
+
 	/* Float4 */
 	if(texture_type == IMAGE_DATA_TYPE_FLOAT4) {
-		return kernel_tex_fetch(__tex_image_float4_packed, offset);
+		return tex_fetch(float4, info, offset);
 	}
 	/* Byte4 */
 	else if(texture_type == IMAGE_DATA_TYPE_BYTE4) {
-		uchar4 r = kernel_tex_fetch(__tex_image_byte4_packed, offset);
+		uchar4 r = tex_fetch(uchar4, info, offset);
 		float f = 1.0f/255.0f;
 		return make_float4(r.x*f, r.y*f, r.z*f, r.w*f);
 	}
 	/* Float */
 	else if(texture_type == IMAGE_DATA_TYPE_FLOAT) {
-		float f = kernel_tex_fetch(__tex_image_float_packed, offset);
+		float f = tex_fetch(float, info, offset);
 		return make_float4(f, f, f, 1.0f);
 	}
 	/* Byte */
 	else {
-		uchar r = kernel_tex_fetch(__tex_image_byte_packed, offset);
+		uchar r = tex_fetch(uchar, info, offset);
 		float f = r * (1.0f/255.0f);
 		return make_float4(f, f, f, 1.0f);
 	}
@@ -64,17 +76,17 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
 	return x - (float)i;
 }
 
-ccl_device_inline uint kernel_decode_image_interpolation(uint4 info)
+ccl_device_inline uint kernel_decode_image_interpolation(uint info)
 {
-	return (info.w & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
+	return (info & (1 << 0)) ? INTERPOLATION_CLOSEST : INTERPOLATION_LINEAR;
 }
 
-ccl_device_inline uint kernel_decode_image_extension(uint4 info)
+ccl_device_inline uint kernel_decode_image_extension(uint info)
 {
-	if(info.w & (1 << 1)) {
+	if(info & (1 << 1)) {
 		return EXTENSION_REPEAT;
 	}
-	else if(info.w & (1 << 2)) {
+	else if(info & (1 << 2)) {
 		return EXTENSION_EXTEND;
 	}
 	else {
@@ -84,13 +96,16 @@ ccl_device_inline uint kernel_decode_image_extension(uint4 info)
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
 {
-	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
-	uint width = info.x;
-	uint height = info.y;
-	uint offset = info.z;
+	const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
+
+	uint width = info->width;
+	uint height = info->height;
+	uint offset = 0;
+
 	/* Decode image options. */
-	uint interpolation = kernel_decode_image_interpolation(info);
-	uint extension = kernel_decode_image_extension(info);
+	uint interpolation = kernel_decode_image_interpolation(info->options);
+	uint extension = kernel_decode_image_extension(info->options);
+
 	/* Actual sampling. */
 	float4 r;
 	int ix, iy, nix, niy;
@@ -150,14 +165,17 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
 
 ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x, float y, float z)
 {
-	uint4 info = kernel_tex_fetch(__tex_image_packed_info, id*2);
-	uint width = info.x;
-	uint height = info.y;
-	uint offset = info.z;
-	uint depth = kernel_tex_fetch(__tex_image_packed_info, id*2+1).x;
+	const ccl_global tex_info_t *info = kernel_tex_info(kg, id);
+
+	uint width = info->width;
+	uint height = info->height;
+	uint offset = 0;
+	uint depth = info->depth;
+
 	/* Decode image options. */
-	uint interpolation = kernel_decode_image_interpolation(info);
-	uint extension = kernel_decode_image_extension(info);
+	uint interpolation = kernel_decode_image_interpolation(info->options);
+	uint extension = kernel_decode_image_extension(info->options);
+
 	/* Actual sampling. */
 	float4 r;
 	int ix, iy, iz, nix, niy, niz;
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 9baa9d54957..c806deee8e7 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -396,11 +396,13 @@ ccl_device_inline float3 background_light_sample(KernelGlobals *kg,
 					     + (1.0f - portal_sampling_pdf) * cdf_pdf);
 				}
 				return D;
-			} else {
+			}
+			else {
 				/* Sample map, but with nonzero portal_sampling_pdf for MIS. */
 				randu = (randu - portal_sampling_pdf) / (1.0f - portal_sampling_pdf);
 			}
-		} else {
+		}
+		else {
 			/* We can't sample a portal.
 			 * Check if we can sample the map instead.
 			 */
@@ -763,78 +765,280 @@ ccl_device bool lamp_light_eval(KernelGlobals *kg, int lamp, float3 P, float3 D,
 
 /* Triangle Light */
 
-ccl_device void object_transform_light_sample(KernelGlobals *kg, LightSample *ls, int object, float time)
+/* returns true if the triangle has motion blur or an instancing transform applied */
+ccl_device_inline bool triangle_world_space_vertices(KernelGlobals *kg, int object, int prim, float time, float3 V[3])
 {
+	bool has_motion = false;
+	const int object_flag = kernel_tex_fetch(__object_flag, object);
+
+	if(object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) {
+		motion_triangle_vertices(kg, object, prim, time, V);
+		has_motion = true;
+	}
+	else {
+		triangle_vertices(kg, prim, V);
+	}
+
 #ifdef __INSTANCING__
-	/* instance transform */
-	if(!(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED)) {
+	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #  ifdef __OBJECT_MOTION__
-		Transform itfm;
-		Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm);
+		Transform tfm = object_fetch_transform_motion_test(kg, object, time, NULL);
 #  else
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
 #  endif
-
-		ls->P = transform_point(&tfm, ls->P);
-		ls->Ng = normalize(transform_direction(&tfm, ls->Ng));
+		V[0] = transform_point(&tfm, V[0]);
+		V[1] = transform_point(&tfm, V[1]);
+		V[2] = transform_point(&tfm, V[2]);
+		has_motion = true;
 	}
 #endif
+	return has_motion;
 }
 
-ccl_device void triangle_light_sample(KernelGlobals *kg, int prim, int object,
-	float randu, float randv, float time, LightSample *ls)
+ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
 {
-	float u, v;
+	float pdf = kernel_data.integrator.pdf_triangles;
+	float cos_pi = fabsf(dot(Ng, I));
 
-	/* compute random point in triangle */
-	randu = sqrtf(randu);
+	if(cos_pi == 0.0f)
+		return 0.0f;
+
+	return t*t*pdf/cos_pi;
+}
 
-	u = 1.0f - randu;
-	v = randv*randu;
+ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t)
+{
+	/* A naive heuristic to decide between costly solid angle sampling
+	 * and simple area sampling, comparing the distance to the triangle plane
+	 * to the length of the edges of the triangle. */
+
+	float3 V[3];
+	bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V);
+
+	const float3 e0 = V[1] - V[0];
+	const float3 e1 = V[2] - V[0];
+	const float3 e2 = V[2] - V[1];
+	const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+	const float3 N = cross(e0, e1);
+	const float distance_to_plane = fabsf(dot(N, sd->I * t))/dot(N, N);
+
+	if(longest_edge_squared > distance_to_plane*distance_to_plane) {
+		/* sd contains the point on the light source
+		 * calculate Px, the point that we're shading */
+		const float3 Px = sd->P + sd->I * t;
+		const float3 v0_p = V[0] - Px;
+		const float3 v1_p = V[1] - Px;
+		const float3 v2_p = V[2] - Px;
+
+		const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+		const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+		const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+		const float alpha = fast_acosf(dot(u02, u01));
+		const float beta = fast_acosf(-dot(u01, u12));
+		const float gamma = fast_acosf(dot(u02, u12));
+		const float solid_angle =  alpha + beta + gamma - M_PI_F;
+
+		/* pdf_triangles is calculated over triangle area, but we're not sampling over its area */
+		if(UNLIKELY(solid_angle == 0.0f)) {
+			return 0.0f;
+		}
+		else {
+			float area = 1.0f;
+			if(has_motion) {
+				/* get the center frame vertices, this is what the PDF was calculated from */
+				triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
+				area = triangle_area(V[0], V[1], V[2]);
+			}
+			else {
+				area = 0.5f * len(N);
+			}
+			const float pdf = area * kernel_data.integrator.pdf_triangles;
+			return pdf / solid_angle;
+		}
+	}
+	else {
+		float pdf = triangle_light_pdf_area(kg, sd->Ng, sd->I, t);
+		if(has_motion) {
+			const float	area = 0.5f * len(N);
+			if(UNLIKELY(area == 0.0f)) {
+				return 0.0f;
+			}
+			/* scale the PDF.
+			 * area = the area the sample was taken from
+			 * area_pre = the area from which pdf_triangles was calculated */
+			triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
+			const float area_pre = triangle_area(V[0], V[1], V[2]);
+			pdf = pdf * area_pre / area;
+		}
+		return pdf;
+	}
+}
 
-	/* triangle, so get position, normal, shader */
-	triangle_point_normal(kg, object, prim, u, v, &ls->P, &ls->Ng, &ls->shader);
+ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg, int prim, int object,
+	float randu, float randv, float time, LightSample *ls, const float3 P)
+{
+	/* A naive heuristic to decide between costly solid angle sampling
+	 * and simple area sampling, comparing the distance to the triangle plane
+	 * to the length of the edges of the triangle. */
+
+	float3 V[3];
+	bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V);
+
+	const float3 e0 = V[1] - V[0];
+	const float3 e1 = V[2] - V[0];
+	const float3 e2 = V[2] - V[1];
+	const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+	const float3 N0 = cross(e0, e1);
+	float Nl = 0.0f;
+	ls->Ng = safe_normalize_len(N0, &Nl);
+	float area = 0.5f * Nl;
+
+	/* flip normal if necessary */
+	const int object_flag = kernel_tex_fetch(__object_flag, object);
+	if(object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+		ls->Ng = -ls->Ng;
+	}
+	ls->eval_fac = 1.0f;
+	ls->shader = kernel_tex_fetch(__tri_shader, prim);
 	ls->object = object;
 	ls->prim = prim;
 	ls->lamp = LAMP_NONE;
 	ls->shader |= SHADER_USE_MIS;
-	ls->t = 0.0f;
-	ls->u = u;
-	ls->v = v;
 	ls->type = LIGHT_TRIANGLE;
-	ls->eval_fac = 1.0f;
 
-	object_transform_light_sample(kg, ls, object, time);
-}
+	float distance_to_plane = fabsf(dot(N0, V[0] - P)/dot(N0, N0));
+
+	if(longest_edge_squared > distance_to_plane*distance_to_plane) {
+		/* see James Arvo, "Stratified Sampling of Spherical Triangles"
+		 * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
+
+		/* project the triangle to the unit sphere
+		 * and calculate its edges and angles */
+		const float3 v0_p = V[0] - P;
+		const float3 v1_p = V[1] - P;
+		const float3 v2_p = V[2] - P;
+
+		const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+		const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+		const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+		const float3 A = safe_normalize(v0_p);
+		const float3 B = safe_normalize(v1_p);
+		const float3 C = safe_normalize(v2_p);
+
+		const float cos_alpha = dot(u02, u01);
+		const float cos_beta = -dot(u01, u12);
+		const float cos_gamma = dot(u02, u12);
+
+		/* calculate dihedral angles */
+		const float alpha = fast_acosf(cos_alpha);
+		const float beta = fast_acosf(cos_beta);
+		const float gamma = fast_acosf(cos_gamma);
+		/* the area of the unit spherical triangle = solid angle */
+		const float solid_angle =  alpha + beta + gamma - M_PI_F;
+
+		/* precompute a few things
+		 * these could be re-used to take several samples
+		 * as they are independent of randu/randv */
+		const float cos_c = dot(A, B);
+		const float sin_alpha = fast_sinf(alpha);
+		const float product = sin_alpha * cos_c;
+
+		/* Select a random sub-area of the spherical triangle
+		 * and calculate the third vertex C_ of that new triangle */
+		const float phi = randu * solid_angle - alpha;
+		float s, t;
+		fast_sincosf(phi, &s, &t);
+		const float u = t - cos_alpha;
+		const float v = s + product;
+
+		const float3 U = safe_normalize(C - dot(C, A) * A);
+
+		float q = 1.0f;
+		const float det = ((v * s + u * t) * sin_alpha);
+		if(det != 0.0f) {
+			q = ((v * t - u * s) * cos_alpha - v) / det;
+		}
+		const float temp = max(1.0f - q*q, 0.0f);
 
-ccl_device float triangle_light_pdf(KernelGlobals *kg,
-	const float3 Ng, const float3 I, float t)
-{
-	float pdf = kernel_data.integrator.pdf_triangles;
-	float cos_pi = fabsf(dot(Ng, I));
+		const float3 C_ = safe_normalize(q * A + sqrtf(temp) * U);
 
-	if(cos_pi == 0.0f)
-		return 0.0f;
-	
-	return t*t*pdf/cos_pi;
+		/* Finally, select a random point along the edge of the new triangle
+		 * That point on the spherical triangle is the sampled ray direction */
+		const float z = 1.0f - randv * (1.0f - dot(C_, B));
+		ls->D = z * B + safe_sqrtf(1.0f - z*z) * safe_normalize(C_ - dot(C_, B) * B);
+
+		/* calculate intersection with the planar triangle */
+		if(!ray_triangle_intersect(P, ls->D, FLT_MAX,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+		                           (ssef*)V,
+#else
+		                           V[0], V[1], V[2],
+#endif
+		                           &ls->u, &ls->v, &ls->t)) {
+			ls->pdf = 0.0f;
+			return;
+		}
+
+		ls->P = P + ls->D * ls->t;
+
+		/* pdf_triangles is calculated over triangle area, but we're sampling over solid angle */
+		if(UNLIKELY(solid_angle == 0.0f)) {
+			ls->pdf = 0.0f;
+			return;
+		}
+		else {
+			if(has_motion) {
+				/* get the center frame vertices, this is what the PDF was calculated from */
+				triangle_world_space_vertices(kg, object, prim, -1.0f, V);
+				area = triangle_area(V[0], V[1], V[2]);
+			}
+			const float pdf = area * kernel_data.integrator.pdf_triangles;
+			ls->pdf = pdf / solid_angle;
+		}
+	}
+	else {
+		/* compute random point in triangle */
+		randu = sqrtf(randu);
+
+		const float u = 1.0f - randu;
+		const float v = randv*randu;
+		const float t = 1.0f - u - v;
+		ls->P = u * V[0] + v * V[1] + t * V[2];
+		/* compute incoming direction, distance and pdf */
+		ls->D = normalize_len(ls->P - P, &ls->t);
+		ls->pdf = triangle_light_pdf_area(kg, ls->Ng, -ls->D, ls->t);
+		if(has_motion && area != 0.0f) {
+			/* scale the PDF.
+			 * area = the area the sample was taken from
+			 * area_pre = the area from which pdf_triangles was calculated */
+			triangle_world_space_vertices(kg, object, prim, -1.0f, V);
+			const float area_pre = triangle_area(V[0], V[1], V[2]);
+			ls->pdf = ls->pdf * area_pre / area;
+		}
+		ls->u = u;
+		ls->v = v;
+	}
 }
 
 /* Light Distribution */
 
-ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
+ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
 {
-	/* this is basically std::upper_bound as used by pbrt, to find a point light or
+	/* This is basically std::upper_bound as used by pbrt, to find a point light or
 	 * triangle to emit from, proportional to area. a good improvement would be to
 	 * also sample proportional to power, though it's not so well defined with
-	 * OSL shaders. */
+	 * arbitrary shaders. */
 	int first = 0;
 	int len = kernel_data.integrator.num_distribution + 1;
+	float r = *randu;
 
 	while(len > 0) {
 		int half_len = len >> 1;
 		int middle = first + half_len;
 
-		if(randt < kernel_tex_fetch(__light_distribution, middle).x) {
+		if(r < kernel_tex_fetch(__light_distribution, middle).x) {
 			len = half_len;
 		}
 		else {
@@ -843,9 +1047,17 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float randt)
 		}
 	}
 
-	/* clamping should not be needed but float rounding errors seem to
-	 * make this fail on rare occasions */
-	return clamp(first-1, 0, kernel_data.integrator.num_distribution-1);
+	/* Clamping should not be needed but float rounding errors seem to
+	 * make this fail on rare occasions. */
+	int index = clamp(first-1, 0, kernel_data.integrator.num_distribution-1);
+
+	/* Rescale to reuse random number. This helps the 2D samples within
+	 * each area light be stratified as well. */
+	float distr_min = kernel_tex_fetch(__light_distribution, index).x;
+	float distr_max = kernel_tex_fetch(__light_distribution, index+1).x;
+	*randu = (r - distr_min)/(distr_max - distr_min);
+
+	return index;
 }
 
 /* Generic Light */
@@ -857,7 +1069,6 @@ ccl_device bool light_select_reached_max_bounces(KernelGlobals *kg, int index, i
 }
 
 ccl_device_noinline bool light_sample(KernelGlobals *kg,
-                                      float randt,
                                       float randu,
                                       float randv,
                                       float time,
@@ -866,7 +1077,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg,
                                       LightSample *ls)
 {
 	/* sample index */
-	int index = light_distribution_sample(kg, randt);
+	int index = light_distribution_sample(kg, &randu);
 
 	/* fetch light data */
 	float4 l = kernel_tex_fetch(__light_distribution, index);
@@ -876,10 +1087,7 @@ ccl_device_noinline bool light_sample(KernelGlobals *kg,
 		int object = __float_as_int(l.w);
 		int shader_flag = __float_as_int(l.z);
 
-		triangle_light_sample(kg, prim, object, randu, randv, time, ls);
-		/* compute incoming direction, distance and pdf */
-		ls->D = normalize_len(ls->P - P, &ls->t);
-		ls->pdf = triangle_light_pdf(kg, ls->Ng, -ls->D, ls->t);
+		triangle_light_sample(kg, prim, object, randu, randv, time, ls, P);
 		ls->shader |= shader_flag;
 		return (ls->pdf > 0.0f);
 	}
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 9cd7ffb181d..fff7f4cfdb7 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -194,8 +194,38 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
 #endif  /* __DENOISING_FEATURES__ */
 }
 
+#ifdef __KERNEL_DEBUG__
+ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
+                                                 ccl_global float *buffer,
+                                                 PathRadiance *L,
+                                                 int sample)
+{
+	int flag = kernel_data.film.pass_flag;
+	if(flag & PASS_BVH_TRAVERSED_NODES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_nodes,
+		                        sample,
+		                        L->debug_data.num_bvh_traversed_nodes);
+	}
+	if(flag & PASS_BVH_TRAVERSED_INSTANCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_traversed_instances,
+		                        sample,
+		                        L->debug_data.num_bvh_traversed_instances);
+	}
+	if(flag & PASS_BVH_INTERSECTIONS) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_bvh_intersections,
+		                        sample,
+		                        L->debug_data.num_bvh_intersections);
+	}
+	if(flag & PASS_RAY_BOUNCES) {
+		kernel_write_pass_float(buffer + kernel_data.film.pass_ray_bounces,
+		                        sample,
+		                        L->debug_data.num_ray_bounces);
+	}
+}
+#endif /* __KERNEL_DEBUG__ */
+
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
-	ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
+	ShaderData *sd, ccl_addr_space PathState *state, float3 throughput)
 {
 #ifdef __PASSES__
 	int path_flag = state->flag;
@@ -213,6 +243,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
 		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
 		{
+			int sample = state->sample;
 
 			if(sample == 0) {
 				if(flag & PASS_DEPTH) {
@@ -230,7 +261,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 			}
 
 			if(flag & PASS_NORMAL) {
-				float3 normal = sd->N;
+				float3 normal = shader_bsdf_average_normal(kg, sd);
 				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
 			}
 			if(flag & PASS_UV) {
@@ -334,19 +365,11 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
 }
 
 ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
-	int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
+	int sample, PathRadiance *L)
 {
 	if(L) {
-		float3 L_sum;
-#ifdef __SHADOW_TRICKS__
-		if(is_shadow_catcher) {
-			L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
-		}
-		else
-#endif  /* __SHADOW_TRICKS__ */
-		{
-			L_sum = path_radiance_clamp_and_sum(kg, L);
-		}
+		float alpha;
+		float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
 
 		kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
 
@@ -361,16 +384,7 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *
 #  endif
 			if(kernel_data.film.pass_denoising_clean) {
 				float3 noisy, clean;
-#ifdef __SHADOW_TRICKS__
-				if(is_shadow_catcher) {
-					noisy = L_sum;
-					clean = make_float3(0.0f, 0.0f, 0.0f);
-				}
-				else
-#endif  /* __SHADOW_TRICKS__ */
-				{
-					path_radiance_split_denoising(kg, L, &noisy, &clean);
-				}
+				path_radiance_split_denoising(kg, L, &noisy, &clean);
 				kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
 				                                  sample, noisy);
 				kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
@@ -389,6 +403,11 @@ ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *
 			                                 sample, L->denoising_depth);
 		}
 #endif  /* __DENOISING_FEATURES__ */
+
+
+#ifdef __KERNEL_DEBUG__
+		kernel_write_debug_passes(kg, buffer, L, sample);
+#endif
 	}
 	else {
 		kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index fc093ad8319..793fede0deb 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -48,25 +48,308 @@
 #include "kernel/kernel_path_volume.h"
 #include "kernel/kernel_path_subsurface.h"
 
+CCL_NAMESPACE_BEGIN
+
+ccl_device_forceinline bool kernel_path_scene_intersect(
+	KernelGlobals *kg,
+	ccl_addr_space PathState *state,
+	Ray *ray,
+	Intersection *isect,
+	PathRadiance *L)
+{
+	uint visibility = path_state_ray_visibility(kg, state);
+
+#ifdef __HAIR__
+	float difl = 0.0f, extmax = 0.0f;
+	uint lcg_state = 0;
+
+	if(kernel_data.bvh.have_curves) {
+		if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) {
+			float3 pixdiff = ray->dD.dx + ray->dD.dy;
+			/*pixdiff = pixdiff - dot(pixdiff, ray->D)*ray->D;*/
+			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
+		}
+
+		extmax = kernel_data.curve.maximum_width;
+		lcg_state = lcg_state_init_addrspace(state, 0x51633e2d);
+	}
+
+	if(path_state_ao_bounce(kg, state)) {
+		visibility = PATH_RAY_SHADOW;
+		ray->t = kernel_data.background.ao_distance;
+	}
+
+	bool hit = scene_intersect(kg, *ray, visibility, isect, &lcg_state, difl, extmax);
+#else
+	bool hit = scene_intersect(kg, *ray, visibility, isect, NULL, 0.0f, 0.0f);
+#endif  /* __HAIR__ */
+
 #ifdef __KERNEL_DEBUG__
-#  include "kernel/kernel_debug.h"
-#endif
+	if(state->flag & PATH_RAY_CAMERA) {
+		L->debug_data.num_bvh_traversed_nodes += isect->num_traversed_nodes;
+		L->debug_data.num_bvh_traversed_instances += isect->num_traversed_instances;
+		L->debug_data.num_bvh_intersections += isect->num_intersections;
+	}
+	L->debug_data.num_ray_bounces++;
+#endif  /* __KERNEL_DEBUG__ */
 
-CCL_NAMESPACE_BEGIN
+	return hit;
+}
+
+ccl_device_forceinline void kernel_path_lamp_emission(
+	KernelGlobals *kg,
+	ccl_addr_space PathState *state,
+	Ray *ray,
+	float3 throughput,
+	ccl_addr_space Intersection *isect,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+#ifdef __LAMP_MIS__
+	if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
+		/* Non-camera rays only: build a ray starting from the previous non-transparent bounce. */
+		Ray light_ray;
+
+		light_ray.P = ray->P - state->ray_t*ray->D;  /* step back by the distance accumulated in state->ray_t */
+		state->ray_t += isect->t;  /* accumulate this segment's isect->t before using it as the ray length */
+		light_ray.D = ray->D;
+		light_ray.t = state->ray_t;
+		light_ray.time = ray->time;
+		light_ray.dD = ray->dD;
+		light_ray.dP = ray->dP;
+
+		/* Intersect with lamps; emission is accumulated only if indirect_lamp_emission() returns true. */
+		float3 emission;
+
+		if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission))
+			path_radiance_accum_emission(L, state, throughput, emission);
+	}
+#endif  /* __LAMP_MIS__ */
+}
+
+ccl_device_forceinline void kernel_path_background(
+	KernelGlobals *kg,
+	ccl_addr_space PathState *state,
+	ccl_addr_space Ray *ray,
+	float3 throughput,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+	/* Callers invoke this when nothing was hit. With a transparent background, camera rays add average(throughput) to L->transparent. */
+	if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+		L->transparent += average(throughput);
+
+#ifdef __PASSES__
+		if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif  /* __PASSES__ */
+			return;  /* unconditional without __PASSES__; otherwise only when PASS_BACKGROUND is not set */
+	}
+
+#ifdef __BACKGROUND__
+	/* Evaluate the background shader and accumulate its result into L. */
+	float3 L_background = indirect_background(kg, emission_sd, state, ray);
+	path_radiance_accum_background(L, state, throughput, L_background);
+#endif  /* __BACKGROUND__ */
+}
+
+#ifndef __SPLIT_KERNEL__
+
+#ifdef __VOLUME__
+ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	PathState *state,
+	Ray *ray,
+	float3 *throughput,
+	ccl_addr_space Intersection *isect,
+	bool hit,
+	ShaderData *emission_sd,
+	PathRadiance *L)
+{
+	/* Sanitize volume stack. */
+	if(!hit) {
+		kernel_volume_clean_stack(kg, state->volume_stack);
+	}
+	/* volume attenuation, emission, scatter */
+	if(state->volume_stack[0].shader != SHADER_NONE) {
+		Ray volume_ray = *ray;
+		volume_ray.t = (hit)? isect->t: FLT_MAX;  /* segment ends at the hit, or is unbounded when nothing was hit */
+
+		bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+#  ifdef __VOLUME_DECOUPLED__
+		int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
+		bool direct = (state->flag & PATH_RAY_CAMERA) != 0;  /* camera rays are treated as direct */
+		bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, direct, sampling_method);
+
+		if(decoupled) {
+			/* cache steps along volume for repeated sampling */
+			VolumeSegment volume_segment;
+
+			shader_setup_from_volume(kg, sd, &volume_ray);
+			kernel_volume_decoupled_record(kg, state,
+				&volume_ray, sd, &volume_segment, heterogeneous);
+
+			volume_segment.sampling_method = sampling_method;
+
+			/* emission */
+			if(volume_segment.closure_flag & SD_EMISSION)
+				path_radiance_accum_emission(L, state, *throughput, volume_segment.accum_emission);
+
+			/* scattering */
+			VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+			if(volume_segment.closure_flag & SD_SCATTER) {
+				int all = kernel_data.integrator.sample_all_lights_indirect;  /* NOTE(review): kernel_path_integrate() previously used `all = false` here - confirm the change is intended */
+
+				/* direct light sampling */
+				kernel_branched_path_volume_connect_light(kg, sd,
+					emission_sd, *throughput, state, L, all,
+					&volume_ray, &volume_segment);
+
+				/* Indirect sample. If we used distance sampling and took just
+				 * one sample for both direct and indirect light, we could share
+				 * this computation, but that would make the code more complex. */
+				float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+				float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+
+				result = kernel_volume_decoupled_scatter(kg,
+					state, &volume_ray, sd, throughput,
+					rphase, rscatter, &volume_segment, NULL, true);
+			}
+
+			/* free cached steps */
+			kernel_volume_decoupled_free(kg, &volume_segment);
+
+			if(result == VOLUME_PATH_SCATTERED) {
+				if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+					return VOLUME_PATH_SCATTERED;  /* callers restart their loop iteration with the updated ray */
+				else
+					return VOLUME_PATH_MISSED;  /* no bounce was set up; callers end the path */
+			}
+			else {
+				*throughput *= volume_segment.accum_transmittance;
+			}
+		}
+		else
+#  endif  /* __VOLUME_DECOUPLED__ */
+		{
+			/* integrate along volume segment with distance sampling */
+			VolumeIntegrateResult result = kernel_volume_integrate(
+				kg, state, sd, &volume_ray, L, throughput, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+			if(result == VOLUME_PATH_SCATTERED) {
+				/* direct lighting */
+				kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
+				/* indirect light bounce */
+				if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
+					return VOLUME_PATH_SCATTERED;
+				else
+					return VOLUME_PATH_MISSED;
+			}
+#  endif  /* __VOLUME_SCATTER__ */
+		}
+	}
+
+	return VOLUME_PATH_ATTENUATED;  /* no scatter event: callers continue with surface or background shading */
+}
+#endif  /* __VOLUME__ */
+
+#endif /* __SPLIT_KERNEL__ */
+
+ccl_device_forceinline bool kernel_path_shader_apply(
+	KernelGlobals *kg,
+	ShaderData *sd,
+	ccl_addr_space PathState *state,
+	ccl_addr_space Ray *ray,
+	float3 throughput,
+	ShaderData *emission_sd,
+	PathRadiance *L,
+	ccl_global float *buffer)
+{
+#ifdef __SHADOW_TRICKS__
+	if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+		if(state->flag & PATH_RAY_CAMERA) {  /* camera ray hit a shadow catcher: flag the path and record the background */
+			state->flag |= (PATH_RAY_SHADOW_CATCHER |
+						   PATH_RAY_STORE_SHADOW_INFO);
+
+			float3 bg = make_float3(0.0f, 0.0f, 0.0f);  /* stays black when the background is transparent */
+			if(!kernel_data.background.transparent) {
+				bg = indirect_background(kg, emission_sd, state, ray);
+			}
+			path_radiance_accum_shadowcatcher(L, throughput, bg);
+		}
+	}
+	else if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		/* Only update transparency after shadow catcher bounce. */
+		L->shadow_transparency *=
+				average(shader_bsdf_transparency(kg, sd));
+	}
+#endif  /* __SHADOW_TRICKS__ */
+
+	/* Holdout: applies only to camera rays that hit a holdout shader or a holdout mask object. */
+#ifdef __HOLDOUT__
+	if(((sd->flag & SD_HOLDOUT) ||
+		(sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
+	   (state->flag & PATH_RAY_CAMERA))
+	{
+		if(kernel_data.background.transparent) {
+			float3 holdout_weight;
+			if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+				holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+			}
+			else {
+				holdout_weight = shader_holdout_eval(kg, sd);
+			}
+			/* any throughput is ok, should all be identical here */
+			L->transparent += average(holdout_weight*throughput);
+		}
+
+		if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
+			return false;  /* callers stop tracing this path when false is returned */
+		}
+	}
+#endif  /* __HOLDOUT__ */
+
+	/* Holdout mask objects hit by camera rays returned above, so they do not write data passes. */
+	kernel_write_data_passes(kg, buffer, L, sd, state, throughput);  /* NOTE(review): kernel_path_indirect() passes buffer == NULL - confirm this call tolerates it */
+
+	/* blurring of bsdf after bounces, for rays that have a small likelihood
+	 * of following this particular path (diffuse, rough glossy) */
+	if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+		float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+
+		if(blur_pdf < 1.0f) {
+			float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+			shader_bsdf_blur(kg, sd, blur_roughness);
+		}
+	}
+
+#ifdef __EMISSION__
+	/* Accumulate emission from the hit surface when sd->flag has SD_EMISSION set. */
+	if(sd->flag & SD_EMISSION) {
+		float3 emission = indirect_primitive_emission(kg, sd, sd->ray_length, state->flag, state->ray_pdf);
+		path_radiance_accum_emission(L, state, throughput, emission);
+	}
+#endif  /* __EMISSION__ */
+
+	return true;
+}
 
 ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
                                         ShaderData *sd,
                                         ShaderData *emission_sd,
                                         PathRadiance *L,
                                         ccl_addr_space PathState *state,
-                                        RNG *rng,
                                         float3 throughput,
                                         float3 ao_alpha)
 {
 	/* todo: solve correlation */
 	float bsdf_u, bsdf_v;
 
-	path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+	path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 	float ao_factor = kernel_data.background.ao_factor;
 	float3 ao_N;
@@ -83,13 +366,11 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 		light_ray.P = ray_offset(sd->P, sd->Ng);
 		light_ray.D = ao_D;
 		light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
 		light_ray.time = sd->time;
-#endif  /* __OBJECT_MOTION__ */
 		light_ray.dP = sd->dP;
 		light_ray.dD = differential3_zero();
 
-		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
+		if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
 			path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
 		}
 		else {
@@ -100,265 +381,85 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 
 #ifndef __SPLIT_KERNEL__
 
+#if defined(__BRANCHED_PATH__) || defined(__BAKING__)
+
 ccl_device void kernel_path_indirect(KernelGlobals *kg,
                                      ShaderData *sd,
                                      ShaderData *emission_sd,
-                                     RNG *rng,
                                      Ray *ray,
                                      float3 throughput,
-                                     int num_samples,
                                      PathState *state,
                                      PathRadiance *L)
 {
 	/* path iteration */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, state);
-		if(state->bounce > kernel_data.integrator.ao_bounces) {
-			visibility = PATH_RAY_SHADOW;
-			ray->t = kernel_data.background.ao_distance;
-		}
-		bool hit = scene_intersect(kg,
-		                           *ray,
-		                           visibility,
-		                           &isect,
-		                           NULL,
-		                           0.0f, 0.0f);
+		bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L);
 
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
-
-			light_ray.P = ray->P - state->ray_t*ray->D;
-			state->ray_t += isect.t;
-			light_ray.D = ray->D;
-			light_ray.t = state->ray_t;
-			light_ray.time = ray->time;
-			light_ray.dD = ray->dD;
-			light_ray.dP = ray->dP;
-
-			/* intersect with lamp */
-			float3 emission;
-			if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission)) {
-				path_radiance_accum_emission(L,
-				                             throughput,
-				                             emission,
-				                             state->bounce);
-			}
-		}
-#endif  /* __LAMP_MIS__ */
+		/* Find intersection with lamps and compute emission for MIS. */
+		kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L);
 
 #ifdef __VOLUME__
-		/* Sanitize volume stack. */
-		if(!hit) {
-			kernel_volume_clean_stack(kg, state->volume_stack);
-		}
-		/* volume attenuation, emission, scatter */
-		if(state->volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = *ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-
-			bool heterogeneous =
-			        volume_stack_is_heterogeneous(kg,
-			                                      state->volume_stack);
-
-#  ifdef __VOLUME_DECOUPLED__
-			int sampling_method =
-			        volume_stack_sampling_method(kg,
-			                                     state->volume_stack);
-			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, false, sampling_method);
-
-			if(decoupled) {
-				/* cache steps along volume for repeated sampling */
-				VolumeSegment volume_segment;
-
-				shader_setup_from_volume(kg,
-				                         sd,
-				                         &volume_ray);
-				kernel_volume_decoupled_record(kg,
-				                               state,
-				                               &volume_ray,
-				                               sd,
-				                               &volume_segment,
-				                               heterogeneous);
-
-				volume_segment.sampling_method = sampling_method;
-
-				/* emission */
-				if(volume_segment.closure_flag & SD_EMISSION) {
-					path_radiance_accum_emission(L,
-					                             throughput,
-					                             volume_segment.accum_emission,
-					                             state->bounce);
-				}
-
-				/* scattering */
-				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
-				if(volume_segment.closure_flag & SD_SCATTER) {
-					int all = kernel_data.integrator.sample_all_lights_indirect;
-
-					/* direct light sampling */
-					kernel_branched_path_volume_connect_light(kg,
-					                                          rng,
-					                                          sd,
-					                                          emission_sd,
-					                                          throughput,
-					                                          state,
-					                                          L,
-					                                          all,
-					                                          &volume_ray,
-					                                          &volume_segment);
-
-					/* indirect sample. if we use distance sampling and take just
-					 * one sample for direct and indirect light, we could share
-					 * this computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
-
-					result = kernel_volume_decoupled_scatter(kg,
-					                                         state,
-					                                         &volume_ray,
-					                                         sd,
-					                                         &throughput,
-					                                         rphase,
-					                                         rscatter,
-					                                         &volume_segment,
-					                                         NULL,
-					                                         true);
-				}
-
-				/* free cached steps */
-				kernel_volume_decoupled_free(kg, &volume_segment);
-
-				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             sd,
-					                             &throughput,
-					                             state,
-					                             L,
-					                             ray))
-					{
-						continue;
-					}
-					else {
-						break;
-					}
-				}
-				else {
-					throughput *= volume_segment.accum_transmittance;
-				}
-			}
-			else
-#  endif  /* __VOLUME_DECOUPLED__ */
-			{
-				/* integrate along volume segment with distance sampling */
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, state, sd, &volume_ray, L, &throughput, rng, heterogeneous);
-
-#  ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* direct lighting */
-					kernel_path_volume_connect_light(kg,
-					                                 rng,
-					                                 sd,
-					                                 emission_sd,
-					                                 throughput,
-					                                 state,
-					                                 L);
-
-					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
-					                             sd,
-					                             &throughput,
-					                             state,
-					                             L,
-					                             ray))
-					{
-						continue;
-					}
-					else {
-						break;
-					}
-				}
-#  endif  /* __VOLUME_SCATTER__ */
-			}
+		/* Volume integration. */
+		VolumeIntegrateResult result = kernel_path_volume(kg,
+		                                                   sd,
+		                                                   state,
+		                                                   ray,
+		                                                   &throughput,
+		                                                   &isect,
+		                                                   hit,
+		                                                   emission_sd,
+		                                                   L);
+
+		if(result == VOLUME_PATH_SCATTERED) {
+			continue;
+		}
+		else if(result == VOLUME_PATH_MISSED) {
+			break;
 		}
-#endif  /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
+		/* Shade background. */
 		if(!hit) {
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, emission_sd, state, ray);
-			path_radiance_accum_background(L,
-			                               state,
-			                               throughput,
-			                               L_background);
-#endif  /* __BACKGROUND__ */
-
+			kernel_path_background(kg, state, ray, throughput, emission_sd, L);
 			break;
 		}
-		else if(state->bounce > kernel_data.integrator.ao_bounces) {
+		else if(path_state_ao_bounce(kg, state)) {
 			break;
 		}
 
-		/* setup shading */
+		/* Setup and evaluate shader. */
 		shader_setup_from_ray(kg,
 		                      sd,
 		                      &isect,
 		                      ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_INDIRECT);
-#ifdef __BRANCHED_PATH__
-		shader_merge_closures(sd);
-#endif  /* __BRANCHED_PATH__ */
-
-#ifdef __SHADOW_TRICKS__
-		if(!(sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
-			state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
-		}
-#endif  /* __SHADOW_TRICKS__ */
-
-		/* blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy) */
-		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
-
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, sd, blur_roughness);
-			}
-		}
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd->flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg,
-			                                              sd,
-			                                              isect.t,
-			                                              state->flag,
-			                                              state->ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state->bounce);
+		shader_eval_surface(kg, sd, state, state->flag);
+		shader_prepare_closures(sd, state);
+
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             NULL))
+		{
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate */
-		float probability =
-		        path_state_terminate_probability(kg,
-		                                         state,
-		                                         throughput*num_samples);
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
 			break;
 		}
 		else if(probability != 1.0f) {
-			float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
 
 			if(terminate >= probability)
 				break;
@@ -371,7 +472,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
-			kernel_path_ao(kg, sd, emission_sd, L, state, rng, throughput, make_float3(0.0f, 0.0f, 0.0f));
+			kernel_path_ao(kg, sd, emission_sd, L, state, throughput, make_float3(0.0f, 0.0f, 0.0f));
 		}
 #endif  /* __AO__ */
 
@@ -379,22 +480,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		/* bssrdf scatter to a different location on the same object, replacing
 		 * the closures with a diffuse BSDF */
 		if(sd->flag & SD_BSSRDF) {
-			float bssrdf_probability;
-			ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+			float bssrdf_u, bssrdf_v;
+			path_state_rng_2D(kg,
+			                  state,
+			                  PRNG_BSDF_U,
+			                  &bssrdf_u, &bssrdf_v);
 
-			/* modify throughput for picking bssrdf or bsdf */
-			throughput *= bssrdf_probability;
+			const ShaderClosure *sc = shader_bssrdf_pick(sd, &throughput, &bssrdf_u);
 
 			/* do bssrdf scatter step if we picked a bssrdf closure */
 			if(sc) {
-				uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
-
-				float bssrdf_u, bssrdf_v;
-				path_state_rng_2D(kg,
-				                  rng,
-				                  state,
-				                  PRNG_BSDF_U,
-				                  &bssrdf_u, &bssrdf_v);
+				uint lcg_state = lcg_state_init(state, 0x68bc21eb);
+
 				subsurface_scatter_step(kg,
 				                        sd,
 				                        state,
@@ -412,7 +509,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			int all = (kernel_data.integrator.sample_all_lights_indirect) ||
 			          (state->flag & PATH_RAY_SHADOW_CATCHER);
 			kernel_branched_path_surface_connect_light(kg,
-			                                           rng,
 			                                           sd,
 			                                           emission_sd,
 			                                           state,
@@ -423,38 +519,24 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		}
 #endif  /* defined(__EMISSION__) */
 
-		if(!kernel_path_surface_bounce(kg, rng, sd, &throughput, state, L, ray))
+		if(!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray))
 			break;
 	}
 }
 
+#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
 
-ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
-                                              RNG *rng,
-                                              int sample,
-                                              Ray ray,
-                                              ccl_global float *buffer,
-                                              PathRadiance *L,
-                                              bool *is_shadow_catcher)
+ccl_device_forceinline void kernel_path_integrate(
+	KernelGlobals *kg,
+	PathState *state,
+	float3 throughput,
+	Ray *ray,
+	PathRadiance *L,
+	ccl_global float *buffer,
+	ShaderData *emission_sd)
 {
-	/* initialize */
-	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
-
-	path_radiance_init(L, kernel_data.film.use_light_pass);
-
-	/* shader data memory used for both volumes and surfaces, saves stack space */
+	/* Shader data memory used for both volumes and surfaces, saves stack space. */
 	ShaderData sd;
-	/* shader data used by emission, shadows, volume stacks */
-	ShaderData emission_sd;
-
-	PathState state;
-	path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
-	DebugData debug_data;
-	debug_data_init(&debug_data);
-#endif  /* __KERNEL_DEBUG__ */
 
 #ifdef __SUBSURFACE__
 	SubsurfaceIndirectRays ss_indirect;
@@ -465,265 +547,82 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
 
 	/* path iteration */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {	
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
-
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d);
-		}
-
-		if(state.bounce > kernel_data.integrator.ao_bounces) {
-			visibility = PATH_RAY_SHADOW;
-			ray.t = kernel_data.background.ao_distance;
-		}
-
-		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif  /* __HAIR__ */
+		bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L);
 
-#ifdef __KERNEL_DEBUG__
-		if(state.flag & PATH_RAY_CAMERA) {
-			debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
-			debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
-			debug_data.num_bvh_intersections += isect.num_intersections;
-		}
-		debug_data.num_ray_bounces++;
-#endif  /* __KERNEL_DEBUG__ */
-
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
-
-			light_ray.P = ray.P - state.ray_t*ray.D;
-			state.ray_t += isect.t;
-			light_ray.D = ray.D;
-			light_ray.t = state.ray_t;
-			light_ray.time = ray.time;
-			light_ray.dD = ray.dD;
-			light_ray.dP = ray.dP;
-
-			/* intersect with lamp */
-			float3 emission;
-
-			if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
-				path_radiance_accum_emission(L, throughput, emission, state.bounce);
-		}
-#endif  /* __LAMP_MIS__ */
+		/* Find intersection with lamps and compute emission for MIS. */
+		kernel_path_lamp_emission(kg, state, ray, throughput, &isect, emission_sd, L);
 
 #ifdef __VOLUME__
-		/* Sanitize volume stack. */
-		if(!hit) {
-			kernel_volume_clean_stack(kg, state.volume_stack);
-		}
-		/* volume attenuation, emission, scatter */
-		if(state.volume_stack[0].shader != SHADER_NONE) {
-			Ray volume_ray = ray;
-			volume_ray.t = (hit)? isect.t: FLT_MAX;
-
-			bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
-
-#  ifdef __VOLUME_DECOUPLED__
-			int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
-			bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
-
-			if(decoupled) {
-				/* cache steps along volume for repeated sampling */
-				VolumeSegment volume_segment;
-
-				shader_setup_from_volume(kg, &sd, &volume_ray);
-				kernel_volume_decoupled_record(kg, &state,
-					&volume_ray, &sd, &volume_segment, heterogeneous);
-
-				volume_segment.sampling_method = sampling_method;
-
-				/* emission */
-				if(volume_segment.closure_flag & SD_EMISSION)
-					path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
-
-				/* scattering */
-				VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
-				if(volume_segment.closure_flag & SD_SCATTER) {
-					int all = false;
-
-					/* direct light sampling */
-					kernel_branched_path_volume_connect_light(kg, rng, &sd,
-						&emission_sd, throughput, &state, L, all,
-						&volume_ray, &volume_segment);
-
-					/* indirect sample. if we use distance sampling and take just
-					 * one sample for direct and indirect light, we could share
-					 * this computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
-
-					result = kernel_volume_decoupled_scatter(kg,
-						&state, &volume_ray, &sd, &throughput,
-						rphase, rscatter, &volume_segment, NULL, true);
-				}
-
-				/* free cached steps */
-				kernel_volume_decoupled_free(kg, &volume_segment);
-
-				if(result == VOLUME_PATH_SCATTERED) {
-					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
-						continue;
-					else
-						break;
-				}
-				else {
-					throughput *= volume_segment.accum_transmittance;
-				}
-			}
-			else
-#  endif  /* __VOLUME_DECOUPLED__ */
-			{
-				/* integrate along volume segment with distance sampling */
-				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
-
-#  ifdef __VOLUME_SCATTER__
-				if(result == VOLUME_PATH_SCATTERED) {
-					/* direct lighting */
-					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
-
-					/* indirect light bounce */
-					if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
-						continue;
-					else
-						break;
-				}
-#  endif  /* __VOLUME_SCATTER__ */
-			}
+		/* Volume integration. */
+		VolumeIntegrateResult result = kernel_path_volume(kg,
+		                                                   &sd,
+		                                                   state,
+		                                                   ray,
+		                                                   &throughput,
+		                                                   &isect,
+		                                                   hit,
+		                                                   emission_sd,
+		                                                   L);
+
+		if(result == VOLUME_PATH_SCATTERED) {
+			continue;
+		}
+		else if(result == VOLUME_PATH_MISSED) {
+			break;
 		}
-#endif  /* __VOLUME__ */
+#endif  /* __VOLUME__ */
 
+		/* Shade background. */
 		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif  /* __PASSES__ */
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(L, &state, throughput, L_background);
-#endif  /* __BACKGROUND__ */
-
+			kernel_path_background(kg, state, ray, throughput, emission_sd, L);
 			break;
 		}
-		else if(state.bounce > kernel_data.integrator.ao_bounces) {
+		else if(path_state_ao_bounce(kg, state)) {
 			break;
 		}
 
-		/* setup shading */
-		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
-		shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
-
-#ifdef __SHADOW_TRICKS__
-		if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
-			if(state.flag & PATH_RAY_CAMERA) {
-				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
-				state.catcher_object = sd.object;
-				if(!kernel_data.background.transparent) {
-					L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
-				}
-			}
-		}
-		else {
-			state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
-		}
-#endif  /* __SHADOW_TRICKS__ */
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(((sd.flag & SD_HOLDOUT) ||
-		    (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
-		   (state.flag & PATH_RAY_CAMERA))
+		/* Setup and evaluate shader. */
+		shader_setup_from_ray(kg, &sd, &isect, ray);
+		shader_eval_surface(kg, &sd, state, state->flag);
+		shader_prepare_closures(&sd, state);
+
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             &sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             buffer))
 		{
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				}
-				else {
-					holdout_weight = shader_holdout_eval(kg, &sd);
-				}
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
-
-			if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
-				break;
-			}
-		}
-#endif  /* __HOLDOUT__ */
-
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
-
-		/* blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy) */
-		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
-			float blur_pdf = kernel_data.integrator.filter_glossy*state.min_ray_pdf;
-
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, &sd, blur_roughness);
-			}
-		}
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			/* todo: is isect.t wrong here for transparent surfaces? */
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state.bounce);
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate */
-		float probability = path_state_terminate_probability(kg, &state, throughput);
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
 			break;
 		}
 		else if(probability != 1.0f) {
-			float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
 			if(terminate >= probability)
 				break;
 
 			throughput /= probability;
 		}
 
-		kernel_update_denoising_features(kg, &sd, &state, L);
+		kernel_update_denoising_features(kg, &sd, state, L);
 
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
+			kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd));
 		}
 #endif  /* __AO__ */
 
@@ -733,11 +632,10 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
 		if(sd.flag & SD_BSSRDF) {
 			if(kernel_path_subsurface_scatter(kg,
 			                                  &sd,
-			                                  &emission_sd,
+			                                  emission_sd,
 			                                  L,
-			                                  &state,
-			                                  rng,
-			                                  &ray,
+			                                  state,
+			                                  ray,
 			                                  &throughput,
 			                                  &ss_indirect))
 			{
@@ -747,24 +645,22 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
 #endif  /* __SUBSURFACE__ */
 
 		/* direct lighting */
-		kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
+		kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
 
 		/* compute direct lighting and next bounce */
-		if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
+		if(!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray))
 			break;
 	}
 
 #ifdef __SUBSURFACE__
-		kernel_path_subsurface_accum_indirect(&ss_indirect, L);
-
 		/* Trace indirect subsurface rays by restarting the loop. this uses less
 		 * stack memory than invoking kernel_path_indirect.
 		 */
 		if(ss_indirect.num_rays) {
 			kernel_path_subsurface_setup_indirect(kg,
 			                                      &ss_indirect,
-			                                      &state,
-			                                      &ray,
+			                                      state,
+			                                      ray,
 			                                      L,
 			                                      &throughput);
 		}
@@ -773,16 +669,6 @@ ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
 		}
 	}
 #endif  /* __SUBSURFACE__ */
-
-#ifdef __SHADOW_TRICKS__
-	*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
-#endif  /* __SHADOW_TRICKS__ */
-
-#ifdef __KERNEL_DEBUG__
-	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif  /* __KERNEL_DEBUG__ */
-
-	return 1.0f - L_transparent;
 }
 
 ccl_device void kernel_path_trace(KernelGlobals *kg,
@@ -796,25 +682,37 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
 	rng_state += index;
 	buffer += index*pass_stride;
 
-	/* initialize random numbers and ray */
-	RNG rng;
+	/* Initialize random numbers and sample ray. */
+	uint rng_hash;
 	Ray ray;
 
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
-
-	/* integrate */
-	PathRadiance L;
-	bool is_shadow_catcher;
+	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray);
 
-	if(ray.t != 0.0f) {
-		float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
-		kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
-	}
-	else {
-		kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+	if(ray.t == 0.0f) {
+		kernel_write_result(kg, buffer, sample, NULL);
+		return;
 	}
 
-	path_rng_end(kg, rng_state, rng);
+	/* Initialize state. */
+	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+
+	PathRadiance L;
+	path_radiance_init(&L, kernel_data.film.use_light_pass);
+
+	ShaderData emission_sd;
+	PathState state;
+	path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
+
+	/* Integrate. */
+	kernel_path_integrate(kg,
+	                      &state,
+	                      throughput,
+	                      &ray,
+	                      &L,
+	                      buffer,
+	                      &emission_sd);
+
+	kernel_write_result(kg, buffer, sample, &L);
 }
 
 #endif  /* __SPLIT_KERNEL__ */
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 10816d3e5d1..6e0ec22d581 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -23,7 +23,6 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
                                                ShaderData *emission_sd,
                                                PathRadiance *L,
                                                ccl_addr_space PathState *state,
-                                               RNG *rng,
                                                float3 throughput)
 {
 	int num_samples = kernel_data.integrator.ao_samples;
@@ -35,7 +34,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 
 	for(int j = 0; j < num_samples; j++) {
 		float bsdf_u, bsdf_v;
-		path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+		path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 		float3 ao_D;
 		float ao_pdf;
@@ -49,13 +48,11 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 			light_ray.P = ray_offset(sd->P, sd->Ng);
 			light_ray.D = ao_D;
 			light_ray.t = kernel_data.background.ao_distance;
-#ifdef __OBJECT_MOTION__
 			light_ray.time = sd->time;
-#endif  /* __OBJECT_MOTION__ */
 			light_ray.dP = sd->dP;
 			light_ray.dD = differential3_zero();
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
 				path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
 			}
 			else {
@@ -69,7 +66,7 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 
 /* bounce off surface and integrate indirect light */
 ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
-	RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
+	ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
 	float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
 {
 	float sum_sample_weight = 0.0f;
@@ -113,35 +110,38 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 		num_samples = ceil_to_int(num_samples_adjust*num_samples);
 
 		float num_samples_inv = num_samples_adjust/num_samples;
-		RNG bsdf_rng = cmj_hash(*rng, i);
 
 		for(int j = 0; j < num_samples; j++) {
 			PathState ps = *state;
 			float3 tp = throughput;
 			Ray bsdf_ray;
+#ifdef __SHADOW_TRICKS__
+			float shadow_transparency = L->shadow_transparency;
+#endif  /* __SHADOW_TRICKS__ */
+
+			ps.rng_hash = cmj_hash(state->rng_hash, i);
 
 			if(!kernel_branched_path_surface_bounce(kg,
-			                                        &bsdf_rng,
 			                                        sd,
 			                                        sc,
 			                                        j,
 			                                        num_samples,
 			                                        &tp,
 			                                        &ps,
-			                                        L,
+			                                        &L->state,
 			                                        &bsdf_ray,
 			                                        sum_sample_weight))
 			{
 				continue;
 			}
 
+			ps.rng_hash = state->rng_hash;
+
 			kernel_path_indirect(kg,
 			                     indirect_sd,
 			                     emission_sd,
-			                     rng,
 			                     &bsdf_ray,
 			                     tp*num_samples_inv,
-			                     num_samples,
 			                     &ps,
 			                     L);
 
@@ -149,6 +149,10 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 			 * for the next samples */
 			path_radiance_sum_indirect(L);
 			path_radiance_reset_indirect(L);
+
+#ifdef __SHADOW_TRICKS__
+			L->shadow_transparency = shadow_transparency;
+#endif  /* __SHADOW_TRICKS__ */
 		}
 	}
 }
@@ -160,7 +164,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                                         ShaderData *emission_sd,
                                                         PathRadiance *L,
                                                         PathState *state,
-                                                        RNG *rng,
                                                         Ray *ray,
                                                         float3 throughput)
 {
@@ -171,17 +174,17 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 			continue;
 
 		/* set up random number generator */
-		uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
+		uint lcg_state = lcg_state_init(state, 0x68bc21eb);
 		int num_samples = kernel_data.integrator.subsurface_samples;
 		float num_samples_inv = 1.0f/num_samples;
-		RNG bssrdf_rng = cmj_hash(*rng, i);
+		uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i);
 
 		/* do subsurface scatter step with copy of shader data, this will
 		 * replace the BSSRDF with a diffuse BSDF closure */
 		for(int j = 0; j < num_samples; j++) {
 			SubsurfaceIntersection ss_isect;
 			float bssrdf_u, bssrdf_v;
-			path_branched_rng_2D(kg, &bssrdf_rng, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+			path_branched_rng_2D(kg, bssrdf_rng_hash, state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 			int num_hits = subsurface_scatter_multi_intersect(kg,
 			                                                  &ss_isect,
 			                                                  sd,
@@ -234,7 +237,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 					          (state->flag & PATH_RAY_SHADOW_CATCHER);
 					kernel_branched_path_surface_connect_light(
 					        kg,
-					        rng,
 					        &bssrdf_sd,
 					        emission_sd,
 					        &hit_state,
@@ -248,7 +250,6 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 				/* indirect light */
 				kernel_branched_path_surface_indirect_light(
 				        kg,
-				        rng,
 				        &bssrdf_sd,
 				        indirect_sd,
 				        emission_sd,
@@ -262,17 +263,15 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 }
 #endif  /* __SUBSURFACE__ */
 
-ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
-                                                RNG *rng,
-                                                int sample,
-                                                Ray ray,
-                                                ccl_global float *buffer,
-                                                PathRadiance *L,
-                                                bool *is_shadow_catcher)
+ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
+                                               uint rng_hash,
+                                               int sample,
+                                               Ray ray,
+                                               ccl_global float *buffer,
+                                               PathRadiance *L)
 {
 	/* initialize */
 	float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-	float L_transparent = 0.0f;
 
 	path_radiance_init(L, kernel_data.film.use_light_pass);
 
@@ -282,48 +281,16 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 	ShaderData emission_sd, indirect_sd;
 
 	PathState state;
-	path_state_init(kg, &emission_sd, &state, rng, sample, &ray);
-
-#ifdef __KERNEL_DEBUG__
-	DebugData debug_data;
-	debug_data_init(&debug_data);
-#endif  /* __KERNEL_DEBUG__ */
+	path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
 
 	/* Main Loop
 	 * Here we only handle transparency intersections from the camera ray.
 	 * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
 	 */
 	for(;;) {
-		/* intersect scene */
+		/* Find intersection with objects in scene. */
 		Intersection isect;
-		uint visibility = path_state_ray_visibility(kg, &state);
-
-#ifdef __HAIR__
-		float difl = 0.0f, extmax = 0.0f;
-		uint lcg_state = 0;
-
-		if(kernel_data.bvh.have_curves) {
-			if(kernel_data.cam.resolution == 1) {
-				float3 pixdiff = ray.dD.dx + ray.dD.dy;
-				/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-				difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-			}
-
-			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d);
-		}
-
-		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-		bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif  /* __HAIR__ */
-
-#ifdef __KERNEL_DEBUG__
-		debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
-		debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
-		debug_data.num_bvh_intersections += isect.num_intersections;
-		debug_data.num_ray_bounces++;
-#endif  /* __KERNEL_DEBUG__ */
+		bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L);
 
 #ifdef __VOLUME__
 		/* Sanitize volume stack. */
@@ -353,7 +320,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 
 				int all = kernel_data.integrator.sample_all_lights_direct;
 
-				kernel_branched_path_volume_connect_light(kg, rng, &sd,
+				kernel_branched_path_volume_connect_light(kg, &sd,
 					&emission_sd, throughput, &state, L, all,
 					&volume_ray, &volume_segment);
 
@@ -372,30 +339,25 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 					/* scatter sample. if we use distance sampling and take just one
 					 * sample for direct and indirect light, we could share this
 					 * computation, but makes code a bit complex */
-					float rphase = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_PHASE);
-					float rscatter = path_state_rng_1D_for_decision(kg, rng, &ps, PRNG_SCATTER_DISTANCE);
+					float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
+					float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
 
 					VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 						&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
 
-					(void)result;
-					kernel_assert(result == VOLUME_PATH_SCATTERED);
-
-					if(kernel_path_volume_bounce(kg,
-					                             rng,
+					if(result == VOLUME_PATH_SCATTERED &&
+					   kernel_path_volume_bounce(kg,
 					                             &sd,
 					                             &tp,
 					                             &ps,
-					                             L,
+					                             &L->state,
 					                             &pray))
 					{
 						kernel_path_indirect(kg,
 						                     &indirect_sd,
 						                     &emission_sd,
-						                     rng,
 						                     &pray,
 						                     tp*num_samples_inv,
-						                     num_samples,
 						                     &ps,
 						                     L);
 
@@ -409,7 +371,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 
 			/* emission and transmittance */
 			if(volume_segment.closure_flag & SD_EMISSION)
-				path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
+				path_radiance_accum_emission(L, &state, throughput, volume_segment.accum_emission);
 			throughput *= volume_segment.accum_transmittance;
 
 			/* free cached steps */
@@ -431,29 +393,26 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 				path_state_branch(&ps, j, num_samples);
 
 				VolumeIntegrateResult result = kernel_volume_integrate(
-					kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
+					kg, &ps, &sd, &volume_ray, L, &tp, heterogeneous);
 
 #ifdef __VOLUME_SCATTER__
 				if(result == VOLUME_PATH_SCATTERED) {
 					/* todo: support equiangular, MIS and all light sampling.
 					 * alternatively get decoupled ray marching working on the GPU */
-					kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
+					kernel_path_volume_connect_light(kg, &sd, &emission_sd, tp, &state, L);
 
 					if(kernel_path_volume_bounce(kg,
-					                             rng,
 					                             &sd,
 					                             &tp,
 					                             &ps,
-					                             L,
+					                             &L->state,
 					                             &pray))
 					{
 						kernel_path_indirect(kg,
 						                     &indirect_sd,
 						                     &emission_sd,
-						                     rng,
 						                     &pray,
 						                     tp,
-						                     num_samples,
 						                     &ps,
 						                     L);
 
@@ -472,89 +431,42 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 		}
 #endif  /* __VOLUME__ */
 
+		/* Shade background. */
 		if(!hit) {
-			/* eval background shader if nothing hit */
-			if(kernel_data.background.transparent) {
-				L_transparent += average(throughput);
-
-#ifdef __PASSES__
-				if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif  /* __PASSES__ */
-					break;
-			}
-
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(L, &state, throughput, L_background);
-#endif  /* __BACKGROUND__ */
-
+			kernel_path_background(kg, &state, &ray, throughput, &emission_sd, L);
 			break;
 		}
 
-		/* setup shading */
+		/* Setup and evaluate shader. */
 		shader_setup_from_ray(kg, &sd, &isect, &ray);
-		shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
+		shader_eval_surface(kg, &sd, &state, state.flag);
 		shader_merge_closures(&sd);
 
-#ifdef __SHADOW_TRICKS__
-		if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
-			if(state.flag & PATH_RAY_CAMERA) {
-				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
-				state.catcher_object = sd.object;
-				if(!kernel_data.background.transparent) {
-					L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
-				}
-			}
-		}
-		else {
-			state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
-		}
-#endif  /* __SHADOW_TRICKS__ */
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if((sd.flag & SD_HOLDOUT) || (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) {
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				}
-				else {
-					holdout_weight = shader_holdout_eval(kg, &sd);
-				}
-				/* any throughput is ok, should all be identical here */
-				L_transparent += average(holdout_weight*throughput);
-			}
-			if(sd.object_flag & SD_OBJECT_HOLDOUT_MASK) {
-				break;
-			}
-		}
-#endif  /* __HOLDOUT__ */
-
-		/* holdout mask objects do not write data passes */
-		kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd.flag & SD_EMISSION) {
-			float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state.bounce);
+		/* Apply shadow catcher, holdout, emission. */
+		if(!kernel_path_shader_apply(kg,
+		                             &sd,
+		                             &state,
+		                             &ray,
+		                             throughput,
+		                             &emission_sd,
+		                             L,
+		                             buffer))
+		{
+			break;
 		}
-#endif  /* __EMISSION__ */
 
 		/* transparency termination */
 		if(state.flag & PATH_RAY_TRANSPARENT) {
 			/* path termination. this is a strange place to put the termination, it's
 			 * mainly due to the mixed in MIS that we use. gives too many unneeded
 			 * shader evaluations, only need emission if we are going to terminate */
-			float probability = path_state_terminate_probability(kg, &state, throughput);
+			float probability = path_state_continuation_probability(kg, &state, throughput);
 
 			if(probability == 0.0f) {
 				break;
 			}
 			else if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_TERMINATE);
+				float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
 
 				if(terminate >= probability)
 					break;
@@ -568,7 +480,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 #ifdef __AO__
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
-			kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
+			kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, throughput);
 		}
 #endif  /* __AO__ */
 
@@ -576,7 +488,7 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 		/* bssrdf scatter to a different location on the same object */
 		if(sd.flag & SD_BSSRDF) {
 			kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
-			                                        L, &state, rng, &ray, throughput);
+			                                        L, &state, &ray, throughput);
 		}
 #endif  /* __SUBSURFACE__ */
 
@@ -588,13 +500,13 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 			if(kernel_data.integrator.use_direct_light) {
 				int all = (kernel_data.integrator.sample_all_lights_direct) ||
 				          (state.flag & PATH_RAY_SHADOW_CATCHER);
-				kernel_branched_path_surface_connect_light(kg, rng,
+				kernel_branched_path_surface_connect_light(kg,
 					&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
 			}
 #endif  /* __EMISSION__ */
 
 			/* indirect light */
-			kernel_branched_path_surface_indirect_light(kg, rng,
+			kernel_branched_path_surface_indirect_light(kg,
 				&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
 
 			/* continue in case of transparency */
@@ -623,16 +535,6 @@ ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
 		kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
 #endif  /* __VOLUME__ */
 	}
-
-#ifdef __SHADOW_TRICKS__
-	*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
-#endif  /* __SHADOW_TRICKS__ */
-
-#ifdef __KERNEL_DEBUG__
-	kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
-#endif  /* __KERNEL_DEBUG__ */
-
-	return 1.0f - L_transparent;
 }
 
 ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@@ -647,24 +549,21 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
 	buffer += index*pass_stride;
 
 	/* initialize random numbers and ray */
-	RNG rng;
+	uint rng_hash;
 	Ray ray;
 
-	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
+	kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, &ray);
 
 	/* integrate */
 	PathRadiance L;
-	bool is_shadow_catcher;
 
 	if(ray.t != 0.0f) {
-		float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
-		kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
+		kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L);
+		kernel_write_result(kg, buffer, sample, &L);
 	}
 	else {
-		kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
+		kernel_write_result(kg, buffer, sample, NULL);
 	}
-
-	path_rng_end(kg, rng_state, rng);
 }
 
 #endif  /* __SPLIT_KERNEL__ */
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
index 82f83deb595..54dd278a185 100644
--- a/intern/cycles/kernel/kernel_path_common.h
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
                                                ccl_global uint *rng_state,
                                                int sample,
                                                int x, int y,
-                                               RNG *rng,
+                                               uint *rng_hash,
                                                ccl_addr_space Ray *ray)
 {
 	float filter_u;
@@ -34,20 +34,20 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
 		*rng_state = hash_int_2d(x, y);
 	}
 
-	path_rng_init(kg, rng_state, sample, num_samples, rng, x, y, &filter_u, &filter_v);
+	path_rng_init(kg, rng_state, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v);
 
 	/* sample camera ray */
 
 	float lens_u = 0.0f, lens_v = 0.0f;
 
 	if(kernel_data.cam.aperturesize > 0.0f)
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
+		path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
 
 	float time = 0.0f;
 
 #ifdef __CAMERA_MOTION__
 	if(kernel_data.cam.shuttertime != -1.0f)
-		time = path_rng_1D(kg, rng, sample, num_samples, PRNG_TIME);
+		time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME);
 #endif
 
 	camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 5d92fd12201..eccee54c0e3 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -19,15 +19,17 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void path_state_init(KernelGlobals *kg,
                                        ShaderData *stack_sd,
                                        ccl_addr_space PathState *state,
-                                       RNG *rng,
+                                       uint rng_hash,
                                        int sample,
                                        ccl_addr_space Ray *ray)
 {
 	state->flag = PATH_RAY_CAMERA|PATH_RAY_MIS_SKIP;
 
+	state->rng_hash = rng_hash;
 	state->rng_offset = PRNG_BASE_NUM;
 	state->sample = sample;
 	state->num_samples = kernel_data.integrator.aa_samples;
+	state->branch_factor = 1.0f;
 
 	state->bounce = 0;
 	state->diffuse_bounce = 0;
@@ -58,16 +60,12 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
 		/* Initialize volume stack with volume we are inside of. */
 		kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack);
 		/* Seed RNG for cases where we can't use stratified samples .*/
-		state->rng_congruential = lcg_init(*rng + sample*0x51633e2d);
+		state->rng_congruential = lcg_init(rng_hash + sample*0x51633e2d);
 	}
 	else {
 		state->volume_stack[0].shader = SHADER_NONE;
 	}
 #endif
-
-#ifdef __SHADOW_TRICKS__
-	state->catcher_object = OBJECT_NONE;
-#endif
 }
 
 ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label)
@@ -78,12 +76,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
 		state->flag |= PATH_RAY_TRANSPARENT;
 		state->transparent_bounce++;
 
-		/* don't increase random number generator offset here, to avoid some
-		 * unwanted patterns, see path_state_rng_1D_for_decision */
-
 		if(!kernel_data.integrator.transparent_shadows)
 			state->flag |= PATH_RAY_MIS_SKIP;
 
+		/* random number generator next bounce */
+		state->rng_offset += PRNG_BOUNCE_NUM;
+
 		return;
 	}
 
@@ -146,7 +144,7 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
 #endif
 }
 
-ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
+ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, ccl_addr_space PathState *state)
 {
 	uint flag = state->flag & PATH_RAY_ALL_VISIBILITY;
 
@@ -160,17 +158,28 @@ ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *s
 	return flag;
 }
 
-ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_addr_space PathState *state, const float3 throughput)
+ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
+                                                            ccl_addr_space PathState *state,
+                                                            const float3 throughput)
 {
 	if(state->flag & PATH_RAY_TRANSPARENT) {
-		/* transparent rays treated separately */
-		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
+		/* Transparent rays are treated separately with own max bounces. */
+		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
 			return 0.0f;
-		else if(state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce)
+		}
+		/* Do at least one bounce without RR. */
+		else if(state->transparent_bounce <= 1) {
 			return 1.0f;
+		}
+#ifdef __SHADOW_TRICKS__
+		/* Exception for shadow catcher not working correctly with RR. */
+		else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
+			return 1.0f;
+		}
+#endif
 	}
 	else {
-		/* other rays */
+		/* Test max bounces for various ray types. */
 		if((state->bounce >= kernel_data.integrator.max_bounce) ||
 		   (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) ||
 		   (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) ||
@@ -181,13 +190,21 @@ ccl_device_inline float path_state_terminate_probability(KernelGlobals *kg, ccl_
 		{
 			return 0.0f;
 		}
-		else if(state->bounce <= kernel_data.integrator.min_bounce) {
+		/* Do at least one bounce without RR. */
+		else if(state->bounce <= 1) {
 			return 1.0f;
 		}
+#ifdef __SHADOW_TRICKS__
+		/* Exception for shadow catcher not working correctly with RR. */
+		else if((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
+			return 1.0f;
+		}
+#endif
 	}
 
-	/* probalistic termination */
-	return average(throughput); /* todo: try using max here */
+	/* Probabilistic termination: use sqrt() to roughly match typical view
+	 * transform and do path termination a bit later on average. */
+	return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f);
 }
 
 /* TODO(DingTo): Find more meaningful name for this */
@@ -200,5 +217,30 @@ ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state,
 		state->bounce -= 1;
 }
 
+ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state)
+{
+	if(state->bounce <= kernel_data.integrator.ao_bounces) {
+		return false;
+	}
+	/* Discount all transmission bounces and one glossy bounce (if any) before testing the limit. */
+	int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0);
+	return (bounce > kernel_data.integrator.ao_bounces);
+}
+
+ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
+                                         int branch,
+                                         int num_branches)
+{
+	state->rng_offset += PRNG_BOUNCE_NUM;  /* random number generator next bounce */
+
+	if(num_branches > 1) {
+		/* Path is splitting into a branch, adjust so that each branch
+		 * still gets a unique sample from the same sequence. */
+		state->sample = state->sample*num_branches + branch;
+		state->num_samples = state->num_samples*num_branches;
+		state->branch_factor *= num_branches;  /* scales RR continuation probability */
+	}
+}
+
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
index 10b568ac3dd..1436e8e5a5b 100644
--- a/intern/cycles/kernel/kernel_path_subsurface.h
+++ b/intern/cycles/kernel/kernel_path_subsurface.h
@@ -28,16 +28,14 @@ bool kernel_path_subsurface_scatter(
         ShaderData *emission_sd,
         PathRadiance *L,
         ccl_addr_space PathState *state,
-        RNG *rng,
         ccl_addr_space Ray *ray,
         ccl_addr_space float3 *throughput,
         ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
 {
-	float bssrdf_probability;
-	ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+	float bssrdf_u, bssrdf_v;
+	path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 
-	/* modify throughput for picking bssrdf or bsdf */
-	*throughput *= bssrdf_probability;
+	const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
 
 	/* do bssrdf scatter step if we picked a bssrdf closure */
 	if(sc) {
@@ -45,13 +43,11 @@ bool kernel_path_subsurface_scatter(
 		 * the second one should be converted to a diffuse BSDF to
 		 * avoid this.
 		 */
-		kernel_assert(!ss_indirect->tracing);
+		kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR));
 
-		uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
+		uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
 
 		SubsurfaceIntersection ss_isect;
-		float bssrdf_u, bssrdf_v;
-		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
 		int num_hits = subsurface_scatter_multi_intersect(kg,
 		                                                  &ss_isect,
 		                                                  sd,
@@ -60,7 +56,7 @@ bool kernel_path_subsurface_scatter(
 		                                                  bssrdf_u, bssrdf_v,
 		                                                  false);
 #  ifdef __VOLUME__
-		ss_indirect->need_update_volume_stack =
+		bool need_update_volume_stack =
 		        kernel_data.integrator.use_volumes &&
 		        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
 #  endif  /* __VOLUME__ */
@@ -79,29 +75,25 @@ bool kernel_path_subsurface_scatter(
 			                               sc,
 			                               false);
 
+			kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L);
+
 			ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
 			ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
 			ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
-			PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays];
+			PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays];
 
 			*hit_state = *state;
 			*hit_ray = *ray;
 			*hit_tp = *throughput;
+			*hit_L_state = L->state;
 
 			hit_state->rng_offset += PRNG_BOUNCE_NUM;
 
-			path_radiance_init(hit_L, kernel_data.film.use_light_pass);
-			hit_L->direct_throughput = L->direct_throughput;
-			path_radiance_copy_indirect(hit_L, L);
-
-			kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);
-
 			if(kernel_path_surface_bounce(kg,
-			                              rng,
 			                              sd,
 			                              hit_tp,
 			                              hit_state,
-			                              hit_L,
+			                              hit_L_state,
 			                              hit_ray))
 			{
 #  ifdef __LAMP_MIS__
@@ -109,7 +101,7 @@ bool kernel_path_subsurface_scatter(
 #  endif  /* __LAMP_MIS__ */
 
 #  ifdef __VOLUME__
-				if(ss_indirect->need_update_volume_stack) {
+				if(need_update_volume_stack) {
 					Ray volume_ray = *ray;
 					/* Setup ray from previous surface point to the new one. */
 					volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
@@ -122,12 +114,8 @@ bool kernel_path_subsurface_scatter(
 					    hit_state->volume_stack);
 				}
 #  endif  /* __VOLUME__ */
-				path_radiance_reset_indirect(L);
 				ss_indirect->num_rays++;
 			}
-			else {
-				path_radiance_accum_sample(L, hit_L, 1);
-			}
 		}
 		return true;
 	}
@@ -137,23 +125,9 @@ bool kernel_path_subsurface_scatter(
 ccl_device_inline void kernel_path_subsurface_init_indirect(
         ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
 {
-	ss_indirect->tracing = false;
 	ss_indirect->num_rays = 0;
 }
 
-ccl_device void kernel_path_subsurface_accum_indirect(
-        ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
-        PathRadiance *L)
-{
-	if(ss_indirect->tracing) {
-		path_radiance_sum_indirect(L);
-		path_radiance_accum_sample(&ss_indirect->direct_L, L, 1);
-		if(ss_indirect->num_rays == 0) {
-			*L = ss_indirect->direct_L;
-		}
-	}
-}
-
 ccl_device void kernel_path_subsurface_setup_indirect(
         KernelGlobals *kg,
         ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
@@ -162,20 +136,15 @@ ccl_device void kernel_path_subsurface_setup_indirect(
         PathRadiance *L,
         ccl_addr_space float3 *throughput)
 {
-	if(!ss_indirect->tracing) {
-		ss_indirect->direct_L = *L;
-	}
-	ss_indirect->tracing = true;
-
 	/* Setup state, ray and throughput for indirect SSS rays. */
 	ss_indirect->num_rays--;
 
-	ccl_addr_space Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
-	PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays];
+	path_radiance_sum_indirect(L);
+	path_radiance_reset_indirect(L);
 
 	*state = ss_indirect->state[ss_indirect->num_rays];
-	*ray = *indirect_ray;
-	*L = *indirect_L;
+	*ray = ss_indirect->rays[ss_indirect->num_rays];
+	L->state = ss_indirect->L_state[ss_indirect->num_rays];
 	*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
 
 	state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index dcb577e176f..7b566b01b04 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -20,7 +20,6 @@ CCL_NAMESPACE_BEGIN
 /* branched path tracing: connect path directly to position on one or more lights and add it to L */
 ccl_device_noinline void kernel_branched_path_surface_connect_light(
         KernelGlobals *kg,
-        RNG *rng,
         ShaderData *sd,
         ShaderData *emission_sd,
         ccl_addr_space PathState *state,
@@ -50,12 +49,12 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
 
 			int num_samples = ceil_to_int(num_samples_adjust*light_select_num_samples(kg, i));
 			float num_samples_inv = num_samples_adjust/(num_samples*kernel_data.integrator.num_all_lights);
-			RNG lamp_rng = cmj_hash(*rng, i);
+			uint lamp_rng_hash = cmj_hash(state->rng_hash, i);
 
 			for(int j = 0; j < num_samples; j++) {
 				float light_u, light_v;
-				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-				float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples);
+				path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, lamp_rng_hash, state, j, num_samples);
 
 				LightSample ls;
 				if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
@@ -68,7 +67,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
 							path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
@@ -86,17 +85,16 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
 			float num_samples_inv = num_samples_adjust/num_samples;
 
 			for(int j = 0; j < num_samples; j++) {
-				float light_t = path_branched_rng_1D(kg, rng, state, j, num_samples, PRNG_LIGHT);
 				float light_u, light_v;
-				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-				float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+				path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
 
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
-					light_t = 0.5f*light_t;
+					light_u = 0.5f*light_u;
 
 				LightSample ls;
-				if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+				if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 					/* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. */
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;
@@ -105,7 +103,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
 							path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
@@ -119,19 +117,18 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
 	}
 	else {
 		/* sample one light at random */
-		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
-		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-		float terminate = path_state_rng_light_termination(kg, rng, state);
+		path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+		float terminate = path_state_rng_light_termination(kg, state);
 
 		LightSample ls;
-		if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+		if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 			/* sample random light */
 			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
 				float3 shadow;
 
-				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+				if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 					/* accumulate */
 					path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
 				}
@@ -147,14 +144,13 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
 /* branched path tracing: bounce off or through surface to with new direction stored in ray */
 ccl_device bool kernel_branched_path_surface_bounce(
         KernelGlobals *kg,
-        RNG *rng,
         ShaderData *sd,
         const ShaderClosure *sc,
         int sample,
         int num_samples,
         ccl_addr_space float3 *throughput,
         ccl_addr_space PathState *state,
-        PathRadiance *L,
+        PathRadianceState *L_state,
         ccl_addr_space Ray *ray,
         float sum_sample_weight)
 {
@@ -164,7 +160,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
 	float3 bsdf_omega_in;
 	differential3 bsdf_domega_in;
 	float bsdf_u, bsdf_v;
-	path_branched_rng_2D(kg, rng, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+	path_branched_rng_2D(kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 	int label;
 
 	label = shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval,
@@ -174,7 +170,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
 		return false;
 
 	/* modify throughput */
-	path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+	path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
 
 #ifdef __DENOISING_FEATURES__
 	state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
@@ -217,7 +213,7 @@ ccl_device bool kernel_branched_path_surface_bounce(
 #endif
 
 /* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
 	ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state,
 	PathRadiance *L)
 {
@@ -228,7 +224,6 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 #ifdef __SHADOW_TRICKS__
 	if(state->flag & PATH_RAY_SHADOW_CATCHER) {
 		kernel_branched_path_surface_connect_light(kg,
-		                                           rng,
 		                                           sd,
 		                                           emission_sd,
 		                                           state,
@@ -241,9 +236,8 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 #endif
 
 	/* sample illumination from lights to find path contribution */
-	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 	float light_u, light_v;
-	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+	path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 	Ray light_ray;
 	BsdfEval L_light;
@@ -254,13 +248,13 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 #endif
 
 	LightSample ls;
-	if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
-		float terminate = path_state_rng_light_termination(kg, rng, state);
+	if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+		float terminate = path_state_rng_light_termination(kg, state);
 		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
 			float3 shadow;
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
 				path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
 			}
@@ -274,11 +268,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
 ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
-                                           RNG *rng,
                                            ShaderData *sd,
                                            ccl_addr_space float3 *throughput,
                                            ccl_addr_space PathState *state,
-                                           PathRadiance *L,
+                                           PathRadianceState *L_state,
                                            ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
@@ -289,7 +282,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 		float3 bsdf_omega_in;
 		differential3 bsdf_domega_in;
 		float bsdf_u, bsdf_v;
-		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+		path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 		int label;
 
 		label = shader_bsdf_sample(kg, sd, bsdf_u, bsdf_v, &bsdf_eval,
@@ -299,7 +292,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 			return false;
 
 		/* modify throughput */
-		path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
+		path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
 
 		/* set labels */
 		if(!(label & LABEL_TRANSPARENT)) {
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index dcedf51e479..b6a856baf24 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -20,7 +20,6 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline void kernel_path_volume_connect_light(
         KernelGlobals *kg,
-        RNG *rng,
         ShaderData *sd,
         ShaderData *emission_sd,
         float3 throughput,
@@ -32,9 +31,8 @@ ccl_device_inline void kernel_path_volume_connect_light(
 		return;
 
 	/* sample illumination from lights to find path contribution */
-	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 	float light_u, light_v;
-	path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+	path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 	Ray light_ray;
 	BsdfEval L_light;
@@ -42,18 +40,16 @@ ccl_device_inline void kernel_path_volume_connect_light(
 	bool is_lamp;
 
 	/* connect to light from given point where shader has been evaluated */
-#  ifdef __OBJECT_MOTION__
 	light_ray.time = sd->time;
-#  endif
 
-	if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
+	if(light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls))
 	{
-		float terminate = path_state_rng_light_termination(kg, rng, state);
+		float terminate = path_state_rng_light_termination(kg, state);
 		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
 			float3 shadow;
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+			if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 				/* accumulate */
 				path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
 			}
@@ -69,11 +65,10 @@ ccl_device
 #endif
 bool kernel_path_volume_bounce(
     KernelGlobals *kg,
-    RNG *rng,
     ShaderData *sd,
     ccl_addr_space float3 *throughput,
     ccl_addr_space PathState *state,
-    PathRadiance *L,
+    PathRadianceState *L_state,
     ccl_addr_space Ray *ray)
 {
 	/* sample phase function */
@@ -82,7 +77,7 @@ bool kernel_path_volume_bounce(
 	float3 phase_omega_in;
 	differential3 phase_domega_in;
 	float phase_u, phase_v;
-	path_state_rng_2D(kg, rng, state, PRNG_PHASE_U, &phase_u, &phase_v);
+	path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
 	int label;
 
 	label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
@@ -92,7 +87,7 @@ bool kernel_path_volume_bounce(
 		return false;
 	
 	/* modify throughput */
-	path_radiance_bsdf_bounce(L, throughput, &phase_eval, phase_pdf, state->bounce, label);
+	path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label);
 
 	/* set labels */
 	state->ray_pdf = phase_pdf;
@@ -120,7 +115,6 @@ bool kernel_path_volume_bounce(
 #ifndef __SPLIT_KERNEL__
 ccl_device void kernel_branched_path_volume_connect_light(
         KernelGlobals *kg,
-        RNG *rng,
         ShaderData *sd,
         ShaderData *emission_sd,
         float3 throughput,
@@ -138,9 +132,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
 	BsdfEval L_light;
 	bool is_lamp;
 
-#  ifdef __OBJECT_MOTION__
 	light_ray.time = sd->time;
-#  endif
 
 	if(sample_all_lights) {
 		/* lamp sampling */
@@ -150,12 +142,12 @@ ccl_device void kernel_branched_path_volume_connect_light(
 
 			int num_samples = light_select_num_samples(kg, i);
 			float num_samples_inv = 1.0f/(num_samples*kernel_data.integrator.num_all_lights);
-			RNG lamp_rng = cmj_hash(*rng, i);
+			uint lamp_rng_hash = cmj_hash(state->rng_hash, i);
 
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on given light */
 				float light_u, light_v;
-				path_branched_rng_2D(kg, &lamp_rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				path_branched_rng_2D(kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				LightSample ls;
 				lamp_light_sample(kg, i, light_u, light_v, ray->P, &ls);
@@ -163,26 +155,24 @@ ccl_device void kernel_branched_path_volume_connect_light(
 				float3 tp = throughput;
 
 				/* sample position on volume segment */
-				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
-				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+				float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+				float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 
 				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 
-				(void)result;
-				kernel_assert(result == VOLUME_PATH_SCATTERED);
-
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
+				if(result == VOLUME_PATH_SCATTERED &&
+				   lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
 					if(kernel_data.integrator.pdf_triangles != 0.0f)
 						ls.pdf *= 2.0f;
 
-					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
 					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
 							path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
@@ -198,40 +188,37 @@ ccl_device void kernel_branched_path_volume_connect_light(
 
 			for(int j = 0; j < num_samples; j++) {
 				/* sample random position on random triangle */
-				float light_t = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_LIGHT);
 				float light_u, light_v;
-				path_branched_rng_2D(kg, rng, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
+				path_branched_rng_2D(kg, state->rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
 
 				/* only sample triangle lights */
 				if(kernel_data.integrator.num_all_lights)
-					light_t = 0.5f*light_t;
+					light_u = 0.5f*light_u;
 
 				LightSample ls;
-				light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
+				light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 				float3 tp = throughput;
 
 				/* sample position on volume segment */
-				float rphase = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_PHASE);
-				float rscatter = path_branched_rng_1D_for_decision(kg, rng, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+				float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+				float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
 
 				VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 					state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 					
-				(void)result;
-				kernel_assert(result == VOLUME_PATH_SCATTERED);
-
 				/* todo: split up light_sample so we don't have to call it again with new position */
-				if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+				if(result == VOLUME_PATH_SCATTERED &&
+				   light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;
 
-					float terminate = path_branched_rng_light_termination(kg, rng, state, j, num_samples);
+					float terminate = path_branched_rng_light_termination(kg, state->rng_hash, state, j, num_samples);
 					if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 						/* trace shadow ray */
 						float3 shadow;
 
-						if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+						if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 							/* accumulate */
 							path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
 						}
@@ -242,34 +229,31 @@ ccl_device void kernel_branched_path_volume_connect_light(
 	}
 	else {
 		/* sample random position on random light */
-		float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
 		float light_u, light_v;
-		path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+		path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
 
 		LightSample ls;
-		light_sample(kg, light_t, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
+		light_sample(kg, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
 
 		float3 tp = throughput;
 
 		/* sample position on volume segment */
-		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
-		float rscatter = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+		float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+		float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
 
 		VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
 			state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
 			
-		(void)result;
-		kernel_assert(result == VOLUME_PATH_SCATTERED);
-
 		/* todo: split up light_sample so we don't have to call it again with new position */
-		if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
+		if(result == VOLUME_PATH_SCATTERED &&
+		   light_sample(kg, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 			/* sample random light */
-			float terminate = path_state_rng_light_termination(kg, rng, state);
+			float terminate = path_state_rng_light_termination(kg, state);
 			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
 				float3 shadow;
 
-				if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
+				if(!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow)) {
 					/* accumulate */
 					path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
 				}
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index e8a912ccc0b..11798d87cb5 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -18,55 +18,18 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __SOBOL__
-
-/* Skip initial numbers that are not as well distributed, especially the
- * first sequence is just 0 everywhere, which can be problematic for e.g.
- * path termination.
- */
-#define SOBOL_SKIP 64
+/* Pseudo random numbers, uncomment this for debugging correlations. Only run
+ * this single threaded on a CPU for repeatable results. */
+//#define __DEBUG_CORRELATION__
 
-/* High Dimensional Sobol. */
 
-/* Van der Corput radical inverse. */
-ccl_device uint van_der_corput(uint bits)
-{
-	bits = (bits << 16) | (bits >> 16);
-	bits = ((bits & 0x00ff00ff) << 8) | ((bits & 0xff00ff00) >> 8);
-	bits = ((bits & 0x0f0f0f0f) << 4) | ((bits & 0xf0f0f0f0) >> 4);
-	bits = ((bits & 0x33333333) << 2) | ((bits & 0xcccccccc) >> 2);
-	bits = ((bits & 0x55555555) << 1) | ((bits & 0xaaaaaaaa) >> 1);
-	return bits;
-}
-
-/* Sobol radical inverse. */
-ccl_device uint sobol(uint i)
-{
-	uint r = 0;
-	for(uint v = 1U << 31; i; i >>= 1, v ^= v >> 1) {
-		if(i & 1) {
-			r ^= v;
-		}
-	}
-	return r;
-}
+/* High Dimensional Sobol.
+ *
+ * Multidimensional sobol with generator matrices. Dimension 0 and 1 are equal
+ * to classic Van der Corput and Sobol sequences. */
 
-/* Inverse of sobol radical inverse. */
-ccl_device uint sobol_inverse(uint i)
-{
-	const uint msb = 1U << 31;
-	uint r = 0;
-	for(uint v = 1; i; i <<= 1, v ^= v << 1) {
-		if(i & msb) {
-			r ^= v;
-		}
-	}
-	return r;
-}
+#ifdef __SOBOL__
 
-/* Multidimensional sobol with generator matrices
- * dimension 0 and 1 are equal to van_der_corput() and sobol() respectively.
- */
 ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
 {
 	uint result = 0;
@@ -79,51 +42,32 @@ ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
 	return result;
 }
 
-/* Lookup index and x/y coordinate, assumes m is a power of two. */
-ccl_device uint sobol_lookup(const uint m,
-                             const uint frame,
-                             const uint ex,
-                             const uint ey,
-                             uint *x, uint *y)
-{
-	/* Shift is constant per frame. */
-	const uint shift = frame << (m << 1);
-	const uint sobol_shift = sobol(shift);
-	/* Van der Corput is its own inverse. */
-	const uint lower = van_der_corput(ex << (32 - m));
-	/* Need to compensate for ey difference and shift. */
-	const uint sobol_lower = sobol(lower);
-	const uint mask = ~-(1 << m) << (32 - m);  /* Only m upper bits. */
-	const uint delta = ((ey << (32 - m)) ^ sobol_lower ^ sobol_shift) & mask;
-	/* Only use m upper bits for the index (m is a power of two). */
-	const uint sobol_result = delta | (delta >> m);
-	const uint upper = sobol_inverse(sobol_result);
-	const uint index = shift | upper | lower;
-	*x = van_der_corput(index);
-	*y = sobol_shift ^ sobol_result ^ sobol_lower;
-	return index;
-}
+#endif /* __SOBOL__ */
+
 
 ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
-                                         RNG *rng,
+                                         uint rng_hash,
                                          int sample, int num_samples,
                                          int dimension)
 {
+#ifdef __DEBUG_CORRELATION__
+	return (float)drand48();
+#endif
+
 #ifdef __CMJ__
-	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
+#  ifdef __SOBOL__
+	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
+#  endif
+	{
 		/* Correlated multi-jitter. */
-		int p = *rng + dimension;
+		int p = rng_hash + dimension;
 		return cmj_sample_1D(sample, num_samples, p);
 	}
 #endif
 
-#ifdef __SOBOL_FULL_SCREEN__
-	uint result = sobol_dimension(kg, *rng, dimension);
-	float r = (float)result * (1.0f/(float)0xFFFFFFFF);
-	return r;
-#else
-	/* Compute sobol sequence value using direction vectors. */
-	uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension);
+#ifdef __SOBOL__
+	/* Sobol sequence value using direction vectors. */
+	uint result = sobol_dimension(kg, sample, dimension);
 	float r = (float)result * (1.0f/(float)0xFFFFFFFF);
 
 	/* Cranly-Patterson rotation using rng seed */
@@ -132,7 +76,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
 	/* Hash rng with dimension to solve correlation issues.
 	 * See T38710, T50116.
 	 */
-	RNG tmp_rng = cmj_hash_simple(dimension, *rng);
+	uint tmp_rng = cmj_hash_simple(dimension, rng_hash);
 	shift = tmp_rng * (1.0f/(float)0xFFFFFFFF);
 
 	return r + shift - floorf(r + shift);
@@ -140,128 +84,60 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
 }
 
 ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
-                                        RNG *rng,
+                                        uint rng_hash,
                                         int sample, int num_samples,
                                         int dimension,
                                         float *fx, float *fy)
 {
+#ifdef __DEBUG_CORRELATION__
+	*fx = (float)drand48();
+	*fy = (float)drand48();
+	return;
+#endif
+
 #ifdef __CMJ__
-	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
+#  ifdef __SOBOL__
+	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
+#  endif
+	{
 		/* Correlated multi-jitter. */
-		int p = *rng + dimension;
+		int p = rng_hash + dimension;
 		cmj_sample_2D(sample, num_samples, p, fx, fy);
+		return;
 	}
-	else
 #endif
-	{
-		/* Sobol. */
-		*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
-		*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
-	}
+
+#ifdef __SOBOL__
+	/* Sobol. */
+	*fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension);
+	*fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1);
+#endif
 }
 
 ccl_device_inline void path_rng_init(KernelGlobals *kg,
                                      ccl_global uint *rng_state,
                                      int sample, int num_samples,
-                                     RNG *rng,
+                                     uint *rng_hash,
                                      int x, int y,
                                      float *fx, float *fy)
 {
-#ifdef __SOBOL_FULL_SCREEN__
-	uint px, py;
-	uint bits = 16; /* limits us to 65536x65536 and 65536 samples */
-	uint size = 1 << bits;
-	uint frame = sample;
-
-	*rng = sobol_lookup(bits, frame, x, y, &px, &py);
-
-	*rng ^= kernel_data.integrator.seed;
-
-	if(sample == 0) {
-		*fx = 0.5f;
-		*fy = 0.5f;
-	}
-	else {
-		*fx = size * (float)px * (1.0f/(float)0xFFFFFFFF) - x;
-		*fy = size * (float)py * (1.0f/(float)0xFFFFFFFF) - y;
-	}
-#else
-	*rng = *rng_state;
-
-	*rng ^= kernel_data.integrator.seed;
-
-	if(sample == 0) {
-		*fx = 0.5f;
-		*fy = 0.5f;
-	}
-	else {
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
-	}
-#endif
-}
-
-ccl_device void path_rng_end(KernelGlobals *kg,
-                             ccl_global uint *rng_state,
-                             RNG rng)
-{
-	/* nothing to do */
-}
-
-#else  /* __SOBOL__ */
-
-/* Linear Congruential Generator */
-
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
-                                         RNG *rng,
-                                         int sample, int num_samples,
-                                         int dimension)
-{
-	/* implicit mod 2^32 */
-	*rng = (1103515245*(*rng) + 12345);
-	return (float)*rng * (1.0f/(float)0xFFFFFFFF);
-}
-
-ccl_device_inline void path_rng_2D(KernelGlobals *kg,
-                                   RNG *rng,
-                                   int sample, int num_samples,
-                                   int dimension,
-                                   float *fx, float *fy)
-{
-	*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
-	*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
-}
-
-ccl_device void path_rng_init(KernelGlobals *kg,
-                              ccl_global uint *rng_state,
-                              int sample, int num_samples,
-                              RNG *rng,
-                              int x, int y,
-                              float *fx, float *fy)
-{
 	/* load state */
-	*rng = *rng_state;
+	*rng_hash = *rng_state;
+	*rng_hash ^= kernel_data.integrator.seed;
 
-	*rng ^= kernel_data.integrator.seed;
+#ifdef __DEBUG_CORRELATION__
+	srand48(*rng_hash + sample);
+#endif
 
 	if(sample == 0) {
 		*fx = 0.5f;
 		*fy = 0.5f;
 	}
 	else {
-		path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy);
+		path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy);
 	}
 }
 
-ccl_device void path_rng_end(KernelGlobals *kg,
-                             ccl_global uint *rng_state,
-                             RNG rng)
-{
-	/* store state for next sample */
-	*rng_state = rng;
-}
-
-#endif  /* __SOBOL__ */
-
 /* Linear Congruential Generator */
 
 ccl_device uint lcg_step_uint(uint *rng)
@@ -295,44 +171,22 @@ ccl_device uint lcg_init(uint seed)
  */
 
 ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
-                                          RNG *rng,
                                           const ccl_addr_space PathState *state,
                                           int dimension)
 {
 	return path_rng_1D(kg,
-	                   rng,
+	                   state->rng_hash,
 	                   state->sample, state->num_samples,
 	                   state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_state_rng_1D_for_decision(
-        KernelGlobals *kg,
-        RNG *rng,
-        const ccl_addr_space PathState *state,
-        int dimension)
-{
-	/* The rng_offset is not increased for transparent bounces. if we do then
-	 * fully transparent objects can become subtly visible by the different
-	 * sampling patterns used where the transparent object is.
-	 *
-	 * however for some random numbers that will determine if we next bounce
-	 * is transparent we do need to increase the offset to avoid always making
-	 * the same decision. */
-	const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
-	return path_rng_1D(kg,
-	                   rng,
-	                   state->sample, state->num_samples,
-	                   rng_offset + dimension);
-}
-
 ccl_device_inline void path_state_rng_2D(KernelGlobals *kg,
-                                         RNG *rng,
                                          const ccl_addr_space PathState *state,
                                          int dimension,
                                          float *fx, float *fy)
 {
 	path_rng_2D(kg,
-	            rng,
+	            state->rng_hash,
 	            state->sample, state->num_samples,
 	            state->rng_offset + dimension,
 	            fx, fy);
@@ -340,38 +194,22 @@ ccl_device_inline void path_state_rng_2D(KernelGlobals *kg,
 
 ccl_device_inline float path_branched_rng_1D(
         KernelGlobals *kg,
-        RNG *rng,
+        uint rng_hash,
         const ccl_addr_space PathState *state,
         int branch,
         int num_branches,
         int dimension)
 {
 	return path_rng_1D(kg,
-	                   rng,
+	                   rng_hash,
 	                   state->sample * num_branches + branch,
 	                   state->num_samples * num_branches,
 	                   state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_branched_rng_1D_for_decision(
-        KernelGlobals *kg,
-        RNG *rng,
-        const ccl_addr_space PathState *state,
-        int branch,
-        int num_branches,
-        int dimension)
-{
-	const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
-	return path_rng_1D(kg,
-	                   rng,
-	                   state->sample * num_branches + branch,
-	                   state->num_samples * num_branches,
-	                   rng_offset + dimension);
-}
-
 ccl_device_inline void path_branched_rng_2D(
         KernelGlobals *kg,
-        RNG *rng,
+        uint rng_hash,
         const ccl_addr_space PathState *state,
         int branch,
         int num_branches,
@@ -379,7 +217,7 @@ ccl_device_inline void path_branched_rng_2D(
         float *fx, float *fy)
 {
 	path_rng_2D(kg,
-	            rng,
+	            rng_hash,
 	            state->sample * num_branches + branch,
 	            state->num_samples * num_branches,
 	            state->rng_offset + dimension,
@@ -391,52 +229,45 @@ ccl_device_inline void path_branched_rng_2D(
  */
 ccl_device_inline float path_state_rng_light_termination(
         KernelGlobals *kg,
-        RNG *rng,
         const ccl_addr_space PathState *state)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
+		return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
 	}
 	return 0.0f;
 }
 
 ccl_device_inline float path_branched_rng_light_termination(
         KernelGlobals *kg,
-        RNG *rng,
+        uint rng_hash,
         const ccl_addr_space PathState *state,
         int branch,
         int num_branches)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		return path_branched_rng_1D_for_decision(kg,
-		                                         rng,
-		                                         state,
-		                                         branch,
-		                                         num_branches,
-		                                         PRNG_LIGHT_TERMINATE);
+		return path_branched_rng_1D(kg,
+		                            rng_hash,
+		                            state,
+		                            branch,
+		                            num_branches,
+		                            PRNG_LIGHT_TERMINATE);
 	}
 	return 0.0f;
 }
 
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
-                                         int branch,
-                                         int num_branches)
+ccl_device_inline uint lcg_state_init(PathState *state,
+                                      uint scramble)
 {
-	/* path is splitting into a branch, adjust so that each branch
-	 * still gets a unique sample from the same sequence */
-	state->rng_offset += PRNG_BOUNCE_NUM;
-	state->sample = state->sample*num_branches + branch;
-	state->num_samples = state->num_samples*num_branches;
+	return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble);
 }
 
-ccl_device_inline uint lcg_state_init(RNG *rng,
-                                      int rng_offset,
-                                      int sample,
-                                      uint scramble)
+ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state,
+                                                uint scramble)
 {
-	return lcg_init(*rng + rng_offset + sample*scramble);
+	return lcg_init(state->rng_hash + state->rng_offset + state->sample*scramble);
 }
 
+
 ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
 {
 	/* Implicit mod 2^32 */
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index c66f52255f0..eeb4eb0097f 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -66,8 +66,8 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 	/* matrices and time */
 #ifdef __OBJECT_MOTION__
 	shader_setup_object_transforms(kg, sd, ray->time);
-	sd->time = ray->time;
 #endif
+	sd->time = ray->time;
 
 	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
 	sd->ray_length = isect->t;
@@ -83,7 +83,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
 
 		sd->shader = __float_as_int(curvedata.z);
-		sd->P = bvh_curve_refine(kg, sd, isect, ray);
+		sd->P = curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
@@ -271,17 +271,17 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
 	sd->u = u;
 	sd->v = v;
 #endif
+	sd->time = time;
 	sd->ray_length = t;
 
 	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
 	sd->object_flag = 0;
 	if(sd->object != OBJECT_NONE) {
 		sd->object_flag |= kernel_tex_fetch(__object_flag,
-		                                               sd->object);
+		                                    sd->object);
 
 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
-		sd->time = time;
 	}
 	else if(lamp != LAMP_NONE) {
 		sd->ob_tfm  = lamp_fetch_transform(kg, lamp, false);
@@ -385,9 +385,7 @@ ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderDat
 	sd->shader = kernel_data.background.surface_shader;
 	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
 	sd->object_flag = 0;
-#ifdef __OBJECT_MOTION__
 	sd->time = ray->time;
-#endif
 	sd->ray_length = 0.0f;
 
 #ifdef __INSTANCING__
@@ -427,9 +425,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *s
 	sd->shader = SHADER_NONE;
 	sd->flag = 0;
 	sd->object_flag = 0;
-#ifdef __OBJECT_MOTION__
 	sd->time = ray->time;
-#endif
 	sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
 
 #ifdef __INSTANCING__
@@ -498,20 +494,45 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd)
 }
 #endif
 
+/* Defensive sampling. */
+
+ccl_device_inline void shader_prepare_closures(ShaderData *sd,
+                                               ccl_addr_space PathState *state)
+{
+	/* We can likely also do defensive sampling at deeper bounces, particularly
+	 * for cases like a perfect mirror but possibly also others. This will need
+	 * a good heuristic. */
+	if(state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) {
+		float sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			ShaderClosure *sc = &sd->closure[i];
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				sum += sc->sample_weight;
+			}
+		}
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			ShaderClosure *sc = &sd->closure[i];
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				sc->sample_weight = max(sc->sample_weight, 0.125f * sum);
+			}
+		}
+	}
+}
+
+
 /* BSDF */
 
 ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, float *pdf,
-	int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
+	const ShaderClosure *skip_sc, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
 	for(int i = 0; i < sd->num_closure; i++) {
-		if(i == skip_bsdf)
-			continue;
-
 		const ShaderClosure *sc = &sd->closure[i];
 
-		if(CLOSURE_IS_BSDF(sc->type)) {
+		if(sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
 			float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
 
@@ -574,7 +595,7 @@ void shader_bsdf_eval(KernelGlobals *kg,
 #endif
 	{
 		float pdf;
-		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f);
+		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f);
 		if(use_mis) {
 			float weight = power_heuristic(light_pdf, pdf);
 			bsdf_eval_mis(eval, weight);
@@ -582,48 +603,120 @@ void shader_bsdf_eval(KernelGlobals *kg,
 	}
 }
 
-ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
-                                         ShaderData *sd,
-                                         float randu, float randv,
-                                         BsdfEval *bsdf_eval,
-                                         float3 *omega_in,
-                                         differential3 *domega_in,
-                                         float *pdf)
+ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd,
+                                                        float *randu)
 {
 	int sampled = 0;
 
 	if(sd->num_closure > 1) {
-		/* pick a BSDF closure based on sample weights */
+		/* Pick a BSDF based on sample weights. */
 		float sum = 0.0f;
 
-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
-			
-			if(CLOSURE_IS_BSDF(sc->type))
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSDF(sc->type)) {
 				sum += sc->sample_weight;
+			}
 		}
 
-		float r = sd->randb_closure*sum;
-		sum = 0.0f;
+		float r = (*randu)*sum;
+		float partial_sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
 
-		for(sampled = 0; sampled < sd->num_closure; sampled++) {
-			const ShaderClosure *sc = &sd->closure[sampled];
-			
 			if(CLOSURE_IS_BSDF(sc->type)) {
-				sum += sc->sample_weight;
+				float next_sum = partial_sum + sc->sample_weight;
+
+				if(r < next_sum) {
+					sampled = i;
 
-				if(r <= sum)
+					/* Rescale to reuse for direction sample, to better
+					 * preserve stratification. */
+					*randu = (r - partial_sum) / sc->sample_weight;
 					break;
+				}
+
+				partial_sum = next_sum;
 			}
 		}
+	}
 
-		if(sampled == sd->num_closure) {
-			*pdf = 0.0f;
-			return LABEL_NONE;
+	return &sd->closure[sampled];
+}
+
+ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
+                                                          ccl_addr_space float3 *throughput,
+                                                          float *randu)
+{
+	int sampled = 0;
+
+	if(sd->num_closure > 1) {
+		/* Pick a BSDF or BSSRDF based on sample weights. */
+		float sum_bsdf = 0.0f;
+		float sum_bssrdf = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSDF(sc->type)) {
+				sum_bsdf += sc->sample_weight;
+			}
+			else if(CLOSURE_IS_BSSRDF(sc->type)) {
+				sum_bssrdf += sc->sample_weight;
+			}
+		}
+
+		float r = (*randu)*(sum_bsdf + sum_bssrdf);
+		float partial_sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			const ShaderClosure *sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+				float next_sum = partial_sum + sc->sample_weight;
+
+				if(r < next_sum) {
+					if(CLOSURE_IS_BSDF(sc->type)) {
+						*throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf;
+						return NULL;
+					}
+					else {
+						*throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
+						sampled = i;
+
+						/* Rescale to reuse for direction sample, to better
+						 * preserve stratification. */
+						*randu = (r - partial_sum) / sc->sample_weight;
+						break;
+					}
+				}
+
+				partial_sum = next_sum;
+			}
 		}
 	}
 
-	const ShaderClosure *sc = &sd->closure[sampled];
+	return &sd->closure[sampled];
+}
+
+ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
+                                         ShaderData *sd,
+                                         float randu, float randv,
+                                         BsdfEval *bsdf_eval,
+                                         float3 *omega_in,
+                                         differential3 *domega_in,
+                                         float *pdf)
+{
+	const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
+	if(sc == NULL) {
+		*pdf = 0.0f;
+		return LABEL_NONE;
+	}
+
+	/* BSSRDF should already have been handled elsewhere. */
+	kernel_assert(CLOSURE_IS_BSDF(sc->type));
 
 	int label;
 	float3 eval;
@@ -636,7 +729,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
 
 		if(sd->num_closure > 1) {
 			float sweight = sc->sample_weight;
-			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
+			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf*sweight, sweight);
 		}
 	}
 
@@ -669,7 +762,7 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
 	}
 }
 
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd)
 {
 	if(sd->flag & SD_HAS_ONLY_VOLUME)
 		return make_float3(1.0f, 1.0f, 1.0f);
@@ -677,7 +770,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
 	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
+		const ShaderClosure *sc = &sd->closure[i];
 
 		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
 			eval += sc->weight;
@@ -764,6 +857,19 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 	return eval;
 }
 
+ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
+{
+	float3 N = make_float3(0.0f, 0.0f, 0.0f);
+
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+		if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+			N += sc->N*average(sc->weight);
+	}
+
+	return (is_zero(N))? sd->N : normalize(N);
+}
+
 ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -783,12 +889,7 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 		}
 	}
 
-	if(is_zero(N))
-		N = sd->N;
-	else
-		N = normalize(N);
-
-	*N_ = N;
+	*N_ = (is_zero(N))? sd->N : normalize(N);
 	return eval;
 }
 
@@ -863,16 +964,15 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 
 /* Surface Evaluation */
 
-ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng,
-	ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx)
+ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
+	ccl_addr_space PathState *state, int path_flag)
 {
 	sd->num_closure = 0;
 	sd->num_closure_extra = 0;
-	sd->randb_closure = randb;
 
 #ifdef __OSL__
 	if(kg->osl)
-		OSLShader::eval_surface(kg, sd, state, path_flag, ctx);
+		OSLShader::eval_surface(kg, sd, state, path_flag);
 	else
 #endif
 	{
@@ -887,24 +987,23 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng,
 #endif
 	}
 
-	if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) {
-		sd->lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0xb4bc3953);
+	if(sd->flag & SD_BSDF_NEEDS_LCG) {
+		sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953);
 	}
 }
 
 /* Background Evaluation */
 
 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
-	ccl_addr_space PathState *state, int path_flag, ShaderContext ctx)
+	ccl_addr_space PathState *state, int path_flag)
 {
 	sd->num_closure = 0;
 	sd->num_closure_extra = 0;
-	sd->randb_closure = 0.0f;
 
 #ifdef __SVM__
 #ifdef __OSL__
 	if(kg->osl) {
-		OSLShader::eval_background(kg, sd, state, path_flag, ctx);
+		OSLShader::eval_background(kg, sd, state, path_flag);
 	}
 	else
 #endif
@@ -981,17 +1080,22 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s
 				sum += sc->sample_weight;
 		}
 
-		float r = sd->randb_closure*sum;
-		sum = 0.0f;
+		float r = randu*sum;
+		float partial_sum = 0.0f;
 
 		for(sampled = 0; sampled < sd->num_closure; sampled++) {
 			const ShaderClosure *sc = &sd->closure[sampled];
 			
 			if(CLOSURE_IS_PHASE(sc->type)) {
-				sum += sc->sample_weight;
+				float next_sum = partial_sum + sc->sample_weight;
 
-				if(r <= sum)
+				if(r <= next_sum) {
+					/* Rescale to reuse for BSDF direction sample. */
+					randu = (r - partial_sum) / sc->sample_weight;
 					break;
+				}
+
+				partial_sum = next_sum;
 			}
 		}
 
@@ -1039,8 +1143,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
                                           ShaderData *sd,
                                           ccl_addr_space PathState *state,
                                           ccl_addr_space VolumeStack *stack,
-                                          int path_flag,
-                                          ShaderContext ctx)
+                                          int path_flag)
 {
 	/* reset closures once at the start, we will be accumulating the closures
 	 * for all volumes in the stack into a single array of closures */
@@ -1073,7 +1176,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 #ifdef __SVM__
 #  ifdef __OSL__
 		if(kg->osl) {
-			OSLShader::eval_volume(kg, sd, state, path_flag, ctx);
+			OSLShader::eval_volume(kg, sd, state, path_flag);
 		}
 		else
 #  endif
@@ -1092,17 +1195,16 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 
 /* Displacement Evaluation */
 
-ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx)
+ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state)
 {
 	sd->num_closure = 0;
 	sd->num_closure_extra = 0;
-	sd->randb_closure = 0.0f;
 
 	/* this will modify sd->P */
 #ifdef __SVM__
 #  ifdef __OSL__
 	if(kg->osl)
-		OSLShader::eval_displacement(kg, sd, ctx);
+		OSLShader::eval_displacement(kg, sd);
 	else
 #  endif
 	{
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index fab5946970d..8a0da6c3b13 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -16,6 +16,42 @@
 
 CCL_NAMESPACE_BEGIN
 
+#ifdef __VOLUME__
+typedef struct VolumeState {
+#  ifdef __SPLIT_KERNEL__
+#  else
+	PathState ps;
+#  endif
+} VolumeState;
+
+/* Get PathState ready for use for volume stack evaluation. */
+#  ifdef __SPLIT_KERNEL__
+ccl_addr_space
+#  endif
+ccl_device_inline PathState *shadow_blocked_volume_path_state(
+        KernelGlobals *kg,
+        VolumeState *volume_state,
+        ccl_addr_space PathState *state,
+        ShaderData *sd,
+        Ray *ray)
+{
+#  ifdef __SPLIT_KERNEL__
+	ccl_addr_space PathState *ps =
+	        &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#  else
+	PathState *ps = &volume_state->ps;
+#  endif
+	*ps = *state;
+	/* We are checking for shadow on the "other" side of the surface, so need
+	 * to discard volume we are currently at.
+	 */
+	if(dot(sd->Ng, ray->D) < 0.0f) {
+		kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack);
+	}
+	return ps;
+}
+#endif  /* __VOLUME__ */
+
 /* Attenuate throughput accordingly to the given intersection event.
  * Returns true if the throughput is zero and traversal can be aborted.
  */
@@ -49,11 +85,8 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
 		path_state_modify_bounce(state, true);
 		shader_eval_surface(kg,
 		                    shadow_sd,
-		                    NULL,
 		                    state,
-		                    0.0f,
-		                    PATH_RAY_SHADOW,
-		                    SHADER_CONTEXT_SHADOW);
+		                    PATH_RAY_SHADOW);
 		path_state_modify_bounce(state, false);
 		*throughput *= shader_bsdf_transparency(kg, shadow_sd);
 	}
@@ -72,13 +105,14 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
 ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
                                       ShaderData *shadow_sd,
                                       ccl_addr_space PathState *state,
+                                      const uint visibility,
                                       Ray *ray,
                                       Intersection *isect,
                                       float3 *shadow)
 {
 	const bool blocked = scene_intersect(kg,
 	                                     *ray,
-	                                     PATH_RAY_SHADOW_OPAQUE,
+	                                     visibility & PATH_RAY_SHADOW_OPAQUE,
 	                                     isect,
 	                                     NULL,
 	                                     0.0f, 0.0f);
@@ -126,9 +160,10 @@ ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
  * Note that hits array should be as big as max_hits+1.
  */
 ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
+                                                    ShaderData *sd,
                                                     ShaderData *shadow_sd,
                                                     ccl_addr_space PathState *state,
-                                                    const int skip_object,
+                                                    const uint visibility,
                                                     Ray *ray,
                                                     Intersection *hits,
                                                     uint max_hits,
@@ -141,9 +176,12 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 	const bool blocked = scene_intersect_shadow_all(kg,
 	                                                ray,
 	                                                hits,
-	                                                skip_object,
+	                                                visibility,
 	                                                max_hits,
 	                                                &num_hits);
+#    ifdef __VOLUME__
+	VolumeState volume_state;
+#    endif
 	/* If no opaque surface found but we did find transparent hits,
 	 * shade them.
 	 */
@@ -155,12 +193,13 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 		Intersection *isect = hits;
 #    ifdef __VOLUME__
 #      ifdef __SPLIT_KERNEL__
-		ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
-#      else
-		PathState ps_object;
-		PathState *ps = &ps_object;
+		ccl_addr_space
 #      endif
-		*ps = *state;
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
 #    endif
 		sort_intersections(hits, num_hits);
 		for(int hit = 0; hit < num_hits; hit++, isect++) {
@@ -205,8 +244,16 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
 	}
 #    ifdef __VOLUME__
 	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* Apply attenuation from current volume shader/ */
-		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+		/* Apply attenuation from current volume shader. */
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
+		kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
 	}
 #    endif
 	return blocked;
@@ -216,9 +263,10 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
  * loop to help readability of the actual logic.
  */
 ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
+                                               ShaderData *sd,
                                                ShaderData *shadow_sd,
                                                ccl_addr_space PathState *state,
-                                               const int skip_object,
+                                               const uint visibility,
                                                Ray *ray,
                                                uint max_hits,
                                                float3 *shadow)
@@ -251,9 +299,10 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
 #    endif  /* __KERNEL_GPU__ */
 	/* Invoke actual traversal. */
 	return shadow_blocked_transparent_all_loop(kg,
+	                                           sd,
 	                                           shadow_sd,
 	                                           state,
-	                                           skip_object,
+	                                           visibility,
 	                                           ray,
 	                                           hits,
 	                                           max_hits,
@@ -276,27 +325,32 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
  */
 ccl_device bool shadow_blocked_transparent_stepped_loop(
         KernelGlobals *kg,
+        ShaderData *sd,
         ShaderData *shadow_sd,
         ccl_addr_space PathState *state,
-        const int skip_object,
+        const uint visibility,
         Ray *ray,
         Intersection *isect,
         const bool blocked,
         const bool is_transparent_isect,
         float3 *shadow)
 {
-	if((blocked && is_transparent_isect) || skip_object != OBJECT_NONE) {
+#    ifdef __VOLUME__
+	VolumeState volume_state;
+#    endif
+	if(blocked && is_transparent_isect) {
 		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
 		float3 Pend = ray->P + ray->D*ray->t;
 		int bounce = state->transparent_bounce;
 #    ifdef __VOLUME__
 #      ifdef __SPLIT_KERNEL__
-		ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
-#      else
-		PathState ps_object;
-		PathState *ps = &ps_object;
+		ccl_addr_space
 #      endif
-		*ps = *state;
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
 #    endif
 		for(;;) {
 			if(bounce >= kernel_data.integrator.transparent_max_bounce) {
@@ -304,30 +358,13 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
 			}
 			if(!scene_intersect(kg,
 			                    *ray,
-			                    PATH_RAY_SHADOW_TRANSPARENT,
+			                    visibility & PATH_RAY_SHADOW_TRANSPARENT,
 			                    isect,
 			                    NULL,
 			                    0.0f, 0.0f))
 			{
 				break;
 			}
-#ifdef __SHADOW_TRICKS__
-			if(skip_object != OBJECT_NONE) {
-				const int isect_object = (isect->object == PRIM_NONE)
-				        ? kernel_tex_fetch(__prim_object, isect->prim)
-				        : isect->object;
-				if(isect_object == skip_object) {
-					shader_setup_from_ray(kg, shadow_sd, isect, ray);
-					/* Move ray forward. */
-					ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
-					if(ray->t != FLT_MAX) {
-						ray->D = normalize_len(Pend - ray->P, &ray->t);
-					}
-					bounce++;
-					continue;
-				}
-			}
-#endif
 			if(!shader_transparent_shadow(kg, isect)) {
 				return true;
 			}
@@ -363,7 +400,15 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
 #    ifdef __VOLUME__
 	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
 		/* Apply attenuation from current volume shader. */
-		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space
+#      endif
+		PathState *ps = shadow_blocked_volume_path_state(kg,
+		                                                 &volume_state,
+		                                                 state,
+		                                                 sd,
+		                                                 ray);
+		kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
 	}
 #    endif
 	return blocked;
@@ -371,33 +416,28 @@ ccl_device bool shadow_blocked_transparent_stepped_loop(
 
 ccl_device bool shadow_blocked_transparent_stepped(
         KernelGlobals *kg,
+        ShaderData *sd,
         ShaderData *shadow_sd,
         ccl_addr_space PathState *state,
-        const int skip_object,
+        const uint visibility,
         Ray *ray,
         Intersection *isect,
         float3 *shadow)
 {
-	bool blocked, is_transparent_isect;
-	if(skip_object == OBJECT_NONE) {
-		blocked = scene_intersect(kg,
-		                          *ray,
-		                          PATH_RAY_SHADOW_OPAQUE,
-		                          isect,
-		                          NULL,
-		                          0.0f, 0.0f);
-		is_transparent_isect = blocked
-			        ? shader_transparent_shadow(kg, isect)
-			        : false;
-	}
-	else {
-		blocked = false;
-		is_transparent_isect = false;
-	}
+	bool blocked = scene_intersect(kg,
+	                               *ray,
+	                               visibility & PATH_RAY_SHADOW_OPAQUE,
+	                               isect,
+	                               NULL,
+	                               0.0f, 0.0f);
+	bool is_transparent_isect = blocked
+		? shader_transparent_shadow(kg, isect)
+		: false;
 	return shadow_blocked_transparent_stepped_loop(kg,
+	                                               sd,
 	                                               shadow_sd,
 	                                               state,
-	                                               skip_object,
+	                                               visibility,
 	                                               ray,
 	                                               isect,
 	                                               blocked,
@@ -409,6 +449,7 @@ ccl_device bool shadow_blocked_transparent_stepped(
 #endif /* __TRANSPARENT_SHADOWS__ */
 
 ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
+                                      ShaderData *sd,
                                       ShaderData *shadow_sd,
                                       ccl_addr_space PathState *state,
                                       Ray *ray_input,
@@ -422,25 +463,24 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
 		return false;
 	}
 #ifdef __SHADOW_TRICKS__
-	const int skip_object = state->catcher_object;
+	const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER)
+		? PATH_RAY_SHADOW_NON_CATCHER
+		: PATH_RAY_SHADOW;
 #else
-	const int skip_object = OBJECT_NONE;
+	const uint visibility = PATH_RAY_SHADOW;
 #endif
 	/* Do actual shadow shading. */
 	/* First of all, we check if integrator requires transparent shadows.
 	 * if not, we use simplest and fastest ever way to calculate occlusion.
-	 *
-	 * NOTE: We can't do quick opaque test here if we are on shadow-catcher
-	 * path because we don't want catcher object to be casting shadow here.
 	 */
 #ifdef __TRANSPARENT_SHADOWS__
-	if(!kernel_data.integrator.transparent_shadows &&
-	   skip_object == OBJECT_NONE)
+	if(!kernel_data.integrator.transparent_shadows)
 #endif
 	{
 		return shadow_blocked_opaque(kg,
 		                             shadow_sd,
 		                             state,
+		                             visibility,
 		                             ray,
 		                             &isect,
 		                             shadow);
@@ -467,7 +507,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
 	 */
 	const bool blocked = scene_intersect(kg,
 	                                     *ray,
-	                                     PATH_RAY_SHADOW_OPAQUE,
+	                                     visibility & PATH_RAY_SHADOW_OPAQUE,
 	                                     &isect,
 	                                     NULL,
 	                                     0.0f, 0.0f);
@@ -478,9 +518,10 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
 	   max_hits + 1 >= SHADOW_STACK_MAX_HITS)
 	{
 		return shadow_blocked_transparent_stepped_loop(kg,
+		                                               sd,
 		                                               shadow_sd,
 		                                               state,
-		                                               skip_object,
+		                                               visibility,
 		                                               ray,
 		                                               &isect,
 		                                               blocked,
@@ -489,18 +530,20 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
 	}
 #    endif  /* __KERNEL_GPU__ */
 	return shadow_blocked_transparent_all(kg,
+	                                      sd,
 	                                      shadow_sd,
 	                                      state,
-	                                      skip_object,
+	                                      visibility,
 	                                      ray,
 	                                      max_hits,
 	                                      shadow);
 #  else  /* __SHADOW_RECORD_ALL__ */
 	/* Fallback to a slowest version which works on all devices. */
 	return shadow_blocked_transparent_stepped(kg,
+	                                          sd,
 	                                          shadow_sd,
 	                                          state,
-	                                          skip_object,
+	                                          visibility,
 	                                          ray,
 	                                          &isect,
 	                                          shadow);
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 6475d4b66fd..23a09e5e2ca 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -28,87 +28,31 @@ CCL_NAMESPACE_BEGIN
  * - try to reduce one sample model variance
  */
 
-#define BSSRDF_MULTI_EVAL
-
-ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability)
-{
-	/* sum sample weights of bssrdf and bsdf */
-	float bsdf_sum = 0.0f;
-	float bssrdf_sum = 0.0f;
-
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSDF(sc->type))
-			bsdf_sum += sc->sample_weight;
-		else if(CLOSURE_IS_BSSRDF(sc->type))
-			bssrdf_sum += sc->sample_weight;
-	}
-
-	/* use bsdf or bssrdf? */
-	float r = sd->randb_closure*(bsdf_sum + bssrdf_sum);
-
-	if(r < bsdf_sum) {
-		/* use bsdf, and adjust randb so we can reuse it for picking a bsdf */
-		sd->randb_closure = r/bsdf_sum;
-		*probability = (bsdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bsdf_sum: 1.0f;
-		return NULL;
-	}
-
-	/* use bssrdf */
-	r -= bsdf_sum;
-
-	float sum = 0.0f;
-
-	for(int i = 0; i < sd->num_closure; i++) {
-		ShaderClosure *sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSSRDF(sc->type)) {
-			sum += sc->sample_weight;
-
-			if(r <= sum) {
-				sd->randb_closure = (r - (sum - sc->sample_weight))/sc->sample_weight;
-
-#ifdef BSSRDF_MULTI_EVAL
-				*probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bssrdf_sum: 1.0f;
-#else
-				*probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/sc->sample_weight: 1.0f;
-#endif
-				return sc;
-			}
-		}
-	}
-
-	/* should never happen */
-	sd->randb_closure = 0.0f;
-	*probability = 1.0f;
-	return NULL;
-}
-
 ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
-                                                 ShaderClosure *sc,
+                                                 const ShaderClosure *sc,
                                                  float disk_r,
                                                  float r,
                                                  bool all)
 {
-#ifdef BSSRDF_MULTI_EVAL
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
 	float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f);
 	float pdf_sum = 0.0f;
-	float sample_weight_sum = 0.0f;
-	int num_bssrdf = 0;
+	float sample_weight_inv = 0.0f;
 
-	for(int i = 0; i < sd->num_closure; i++) {
-		sc = &sd->closure[i];
-		
-		if(CLOSURE_IS_BSSRDF(sc->type)) {
-			float sample_weight = (all)? 1.0f: sc->sample_weight;
-			sample_weight_sum += sample_weight;
+	if(!all) {
+		float sample_weight_sum = 0.0f;
+
+		for(int i = 0; i < sd->num_closure; i++) {
+			sc = &sd->closure[i];
+
+			if(CLOSURE_IS_BSSRDF(sc->type)) {
+				sample_weight_sum += sc->sample_weight;
+			}
 		}
-	}
 
-	float sample_weight_inv = 1.0f/sample_weight_sum;
+		sample_weight_inv = 1.0f/sample_weight_sum;
+	}
 
 	for(int i = 0; i < sd->num_closure; i++) {
 		sc = &sd->closure[i];
@@ -125,25 +69,16 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
 			/* TODO power heuristic is not working correct here */
 			eval_sum += sc->weight*pdf; //*sample_weight*disk_pdf;
 			pdf_sum += sample_weight*disk_pdf; //*sample_weight*disk_pdf;
-
-			num_bssrdf++;
 		}
 	}
 
 	return (pdf_sum > 0.0f)? eval_sum / pdf_sum : make_float3(0.0f, 0.0f, 0.0f);
-#else
-	float pdf = bssrdf_pdf(pick_sc, r);
-	float disk_pdf = bssrdf_pdf(pick_sc, disk_r);
-
-	return pick_sc->weight * pdf / disk_pdf;
-#endif
 }
 
 /* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, const ShaderClosure *sc, float3 weight, bool hit, float3 N)
 {
 	sd->flag &= ~SD_CLOSURE_FLAGS;
-	sd->randb_closure = 0.0f;
 	sd->num_closure = 0;
 	sd->num_closure_extra = 0;
 
@@ -219,7 +154,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
 
 	if(bump || texture_blur > 0.0f) {
 		/* average color and normal at incoming point */
-		shader_eval_surface(kg, sd, NULL, state, 0.0f, state_flag, SHADER_CONTEXT_SSS);
+		shader_eval_surface(kg, sd, state, state_flag);
 		float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL);
 
 		/* we simply divide out the average color and multiply with the average
@@ -242,8 +177,8 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
         KernelGlobals *kg,
         SubsurfaceIntersection *ss_isect,
         ShaderData *sd,
-        ShaderClosure *sc,
-        RNG *lcg_state,
+        const ShaderClosure *sc,
+        uint *lcg_state,
         float disk_u,
         float disk_v,
         bool all)
@@ -255,26 +190,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	disk_N = sd->Ng;
 	make_orthonormals(disk_N, &disk_T, &disk_B);
 
-	/* reusing variable for picking the closure gives a bit nicer stratification
-	 * for path tracer, for branched we do all closures so it doesn't help */
-	float axisu = (all)? disk_u: sd->randb_closure;
-
-	if(axisu < 0.5f) {
+	if(disk_u < 0.5f) {
 		pick_pdf_N = 0.5f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.25f;
-		if(all)
-			disk_u *= 2.0f;
+		disk_u *= 2.0f;
 	}
-	else if(axisu < 0.75f) {
+	else if(disk_u < 0.75f) {
 		float3 tmp = disk_N;
 		disk_N = disk_T;
 		disk_T = tmp;
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.5f;
 		pick_pdf_B = 0.25f;
-		if(all)
-			disk_u = (disk_u - 0.5f)*4.0f;
+		disk_u = (disk_u - 0.5f)*4.0f;
 	}
 	else {
 		float3 tmp = disk_N;
@@ -283,8 +212,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.5f;
-		if(all)
-			disk_u = (disk_u - 0.75f)*4.0f;
+		disk_u = (disk_u - 0.75f)*4.0f;
 	}
 
 	/* sample point on disk */
@@ -390,7 +318,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
         ShaderData *sd,
         ccl_addr_space PathState *state,
         int state_flag,
-        ShaderClosure *sc,
+        const ShaderClosure *sc,
         bool all)
 {
 #ifdef __SPLIT_KERNEL__
@@ -419,7 +347,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
 
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
 ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state,
-	int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
+	int state_flag, const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
@@ -430,18 +358,20 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a
 	disk_N = sd->Ng;
 	make_orthonormals(disk_N, &disk_T, &disk_B);
 
-	if(sd->randb_closure < 0.5f) {
+	if(disk_u < 0.5f) {
 		pick_pdf_N = 0.5f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.25f;
+		disk_u *= 2.0f;
 	}
-	else if(sd->randb_closure < 0.75f) {
+	else if(disk_u < 0.75f) {
 		float3 tmp = disk_N;
 		disk_N = disk_T;
 		disk_T = tmp;
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.5f;
 		pick_pdf_B = 0.25f;
+		disk_u = (disk_u - 0.5f)*4.0f;
 	}
 	else {
 		float3 tmp = disk_N;
@@ -450,6 +380,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a
 		pick_pdf_N = 0.25f;
 		pick_pdf_T = 0.25f;
 		pick_pdf_B = 0.5f;
+		disk_u = (disk_u - 0.75f)*4.0f;
 	}
 
 	/* sample point on disk */
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index cb1a3f40dee..5eab28a2953 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -82,115 +82,110 @@ KERNEL_TEX(uint, texture_uint, __sobol_directions)
 #  if __CUDA_ARCH__ < 300
 /* full-float image */
 KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_000)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_001)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_002)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_003)
-KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_004)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_008)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_016)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_024)
+KERNEL_IMAGE_TEX(float4, texture_image_float4, __tex_image_float4_032)
 
 KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_000)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_001)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_002)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_003)
-KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_004)
-
-/* image */
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_005)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_006)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_007)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_008)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_008)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_016)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_024)
+KERNEL_IMAGE_TEX(float4, texture_image3d_float4, __tex_image_float4_3d_032)
+
+/* image
+ * The numeric suffix of each texture name is its flattened slot, as
+ * returned by ImageManager::type_index_to_flattened_slot(). */
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_001)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_009)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_010)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_011)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_012)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_013)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_014)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_015)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_016)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_017)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_018)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_019)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_020)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_021)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_022)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_023)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_024)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_025)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_026)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_027)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_028)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_029)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_030)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_031)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_032)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_033)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_034)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_035)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_036)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_037)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_038)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_039)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_040)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_041)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_042)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_043)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_044)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_045)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_046)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_047)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_048)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_049)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_050)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_051)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_052)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_053)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_054)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_055)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_056)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_057)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_058)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_059)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_060)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_061)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_062)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_063)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_064)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_065)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_066)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_067)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_068)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_069)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_070)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_071)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_072)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_073)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_074)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_075)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_076)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_077)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_078)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_079)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_080)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_081)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_082)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_083)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_084)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_097)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_105)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_113)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_121)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_129)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_137)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_145)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_153)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_161)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_169)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_177)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_185)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_193)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_201)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_209)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_217)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_225)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_233)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_241)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_249)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_257)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_265)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_273)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_281)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_289)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_297)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_305)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_313)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_321)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_329)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_337)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_345)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_353)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_361)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_369)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_377)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_385)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_393)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_401)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_409)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_417)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_425)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_433)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_441)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_449)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_457)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_465)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_473)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_481)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_489)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_497)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_505)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_513)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_521)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_529)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_537)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_545)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_553)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_561)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_569)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_577)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_585)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_593)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_601)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_609)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_617)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_625)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_633)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_641)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_649)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_657)
+KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_665)
 
 #  else
 /* bindless textures */
 KERNEL_TEX(uint, texture_uint, __bindless_mapping)
-#  endif
-#endif
-
-/* packed image (opencl) */
-KERNEL_TEX(uchar4, texture_uchar4, __tex_image_byte4_packed)
-KERNEL_TEX(float4, texture_float4, __tex_image_float4_packed)
-KERNEL_TEX(uchar, texture_uchar, __tex_image_byte_packed)
-KERNEL_TEX(float, texture_float, __tex_image_float_packed)
-KERNEL_TEX(uint4, texture_uint4, __tex_image_packed_info)
+#  endif  /* __CUDA_ARCH__ */
+#endif  /* __KERNEL_CUDA__ */
 
 #undef KERNEL_TEX
 #undef KERNEL_IMAGE_TEX
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 34affab1b9d..6c5b6ca3b2d 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -130,6 +130,7 @@ CCL_NAMESPACE_BEGIN
 #  ifdef __KERNEL_OPENCL_APPLE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __PRINCIPLED__
 #    define __CMJ__
 /* TODO(sergey): Currently experimental section is ignored here,
  * this is because megakernel in device_opencl does not support
@@ -154,6 +155,7 @@ CCL_NAMESPACE_BEGIN
 #    define __CL_USE_NATIVE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __PRINCIPLED__
 #    define __CMJ__
 #  endif  /* __KERNEL_OPENCL_INTEL_CPU__ */
 
@@ -240,10 +242,6 @@ CCL_NAMESPACE_BEGIN
 #  undef __DENOISING_FEATURES__
 #endif
 
-/* Random Numbers */
-
-typedef uint RNG;
-
 /* Shader Evaluation */
 
 typedef enum ShaderEvalType {
@@ -283,31 +281,21 @@ enum PathTraceDimension {
 	PRNG_FILTER_V = 1,
 	PRNG_LENS_U = 2,
 	PRNG_LENS_V = 3,
-#ifdef __CAMERA_MOTION__
 	PRNG_TIME = 4,
 	PRNG_UNUSED_0 = 5,
 	PRNG_UNUSED_1 = 6,	/* for some reason (6, 7) is a bad sobol pattern */
 	PRNG_UNUSED_2 = 7,  /* with a low number of samples (< 64) */
-#endif
-	PRNG_BASE_NUM = 8,
+	PRNG_BASE_NUM = 10,
 
 	PRNG_BSDF_U = 0,
 	PRNG_BSDF_V = 1,
-	PRNG_BSDF = 2,
-	PRNG_LIGHT = 3,
-	PRNG_LIGHT_U = 4,
-	PRNG_LIGHT_V = 5,
-	PRNG_LIGHT_TERMINATE = 6,
-	PRNG_TERMINATE = 7,
-
-#ifdef __VOLUME__
-	PRNG_PHASE_U = 8,
-	PRNG_PHASE_V = 9,
-	PRNG_PHASE = 10,
-	PRNG_SCATTER_DISTANCE = 11,
-#endif
-
-	PRNG_BOUNCE_NUM = 12,
+	PRNG_LIGHT_U = 2,
+	PRNG_LIGHT_V = 3,
+	PRNG_LIGHT_TERMINATE = 4,
+	PRNG_TERMINATE = 5,
+	PRNG_PHASE_CHANNEL = 6,
+	PRNG_SCATTER_DISTANCE = 7,
+	PRNG_BOUNCE_NUM = 8,
 };
 
 enum SamplingPattern {
@@ -328,24 +316,28 @@ enum PathRayFlag {
 	PATH_RAY_SINGULAR            = (1 << 5),
 	PATH_RAY_TRANSPARENT         = (1 << 6),
 
-	PATH_RAY_SHADOW_OPAQUE       = (1 << 7),
-	PATH_RAY_SHADOW_TRANSPARENT  = (1 << 8),
-	PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
+	PATH_RAY_SHADOW_OPAQUE_NON_CATCHER       = (1 << 7),
+	PATH_RAY_SHADOW_OPAQUE_CATCHER           = (1 << 8),
+	PATH_RAY_SHADOW_OPAQUE                   = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_OPAQUE_CATCHER),
+	PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER  = (1 << 9),
+	PATH_RAY_SHADOW_TRANSPARENT_CATCHER      = (1 << 10),
+	PATH_RAY_SHADOW_TRANSPARENT              = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_CATCHER),
+	PATH_RAY_SHADOW_NON_CATCHER              = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER|PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER),
+	PATH_RAY_SHADOW                          = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
 
-	PATH_RAY_CURVE               = (1 << 9), /* visibility flag to define curve segments */
-	PATH_RAY_VOLUME_SCATTER      = (1 << 10), /* volume scattering */
+	PATH_RAY_CURVE               = (1 << 11), /* visibility flag to define curve segments */
+	PATH_RAY_VOLUME_SCATTER      = (1 << 12), /* volume scattering */
 
 	/* Special flag to tag unaligned BVH nodes. */
-	PATH_RAY_NODE_UNALIGNED = (1 << 11),
+	PATH_RAY_NODE_UNALIGNED = (1 << 13),
 
-	PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
+	PATH_RAY_ALL_VISIBILITY = ((1 << 14)-1),
 
-	PATH_RAY_MIS_SKIP            = (1 << 12),
-	PATH_RAY_DIFFUSE_ANCESTOR    = (1 << 13),
-	PATH_RAY_SINGLE_PASS_DONE    = (1 << 14),
-	PATH_RAY_SHADOW_CATCHER      = (1 << 15),
-	PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
-	PATH_RAY_STORE_SHADOW_INFO   = (1 << 17),
+	PATH_RAY_MIS_SKIP            = (1 << 15),
+	PATH_RAY_DIFFUSE_ANCESTOR    = (1 << 16),
+	PATH_RAY_SINGLE_PASS_DONE    = (1 << 17),
+	PATH_RAY_SHADOW_CATCHER      = (1 << 18),
+	PATH_RAY_STORE_SHADOW_INFO   = (1 << 19),
 };
 
 /* Closure Label */
@@ -462,18 +454,42 @@ typedef enum DenoiseFlag {
 	DENOISING_CLEAN_ALL_PASSES       = (1 << 8)-1,
 } DenoiseFlag;
 
+#ifdef __KERNEL_DEBUG__
+/* NOTE: This is a runtime-only struct, alignment is not
+ * really important here.
+ */
+typedef struct DebugData {
+	int num_bvh_traversed_nodes;
+	int num_bvh_traversed_instances;
+	int num_bvh_intersections;
+	int num_ray_bounces;
+} DebugData;
+#endif
+
+typedef ccl_addr_space struct PathRadianceState {
+#ifdef __PASSES__
+	float3 diffuse;
+	float3 glossy;
+	float3 transmission;
+	float3 subsurface;
+	float3 scatter;
+
+	float3 direct;
+#endif
+} PathRadianceState;
+
 typedef ccl_addr_space struct PathRadiance {
 #ifdef __PASSES__
 	int use_light_pass;
 #endif
 
+	float transparent;
 	float3 emission;
 #ifdef __PASSES__
 	float3 background;
 	float3 ao;
 
 	float3 indirect;
-	float3 direct_throughput;
 	float3 direct_emission;
 
 	float3 color_diffuse;
@@ -494,16 +510,12 @@ typedef ccl_addr_space struct PathRadiance {
 	float3 indirect_subsurface;
 	float3 indirect_scatter;
 
-	float3 path_diffuse;
-	float3 path_glossy;
-	float3 path_transmission;
-	float3 path_subsurface;
-	float3 path_scatter;
-
 	float4 shadow;
 	float mist;
 #endif
 
+	struct PathRadianceState state;
+
 #ifdef __SHADOW_TRICKS__
 	/* Total light reachable across the path, ignoring shadow blocked queries. */
 	float3 path_total;
@@ -515,7 +527,18 @@ typedef ccl_addr_space struct PathRadiance {
 	float3 path_total_shaded;
 
 	/* Color of the background on which shadow is alpha-overed. */
-	float3 shadow_color;
+	float3 shadow_background_color;
+
+	/* Path throughput at the moment when the ray hits a shadow catcher
+	 * object.
+	 */
+	float shadow_throughput;
+
+	/* Accumulated transparency along the path after shadow catcher bounce. */
+	float shadow_transparency;
+
+	/* Indicate if any shadow catcher data is set. */
+	int has_shadow_catcher;
 #endif
 
 #ifdef __DENOISING_FEATURES__
@@ -523,6 +546,10 @@ typedef ccl_addr_space struct PathRadiance {
 	float3 denoising_albedo;
 	float denoising_depth;
 #endif  /* __DENOISING_FEATURES__ */
+
+#ifdef __KERNEL_DEBUG__
+	DebugData debug_data;
+#endif /* __KERNEL_DEBUG__ */
 } PathRadiance;
 
 typedef struct BsdfEval {
@@ -774,20 +801,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
 	float data[10]; /* pad to 80 bytes */
 } ShaderClosure;
 
-/* Shader Context
- *
- * For OSL we recycle a fixed number of contexts for speed */
-
-typedef enum ShaderContext {
-	SHADER_CONTEXT_MAIN = 0,
-	SHADER_CONTEXT_INDIRECT = 1,
-	SHADER_CONTEXT_EMISSION = 2,
-	SHADER_CONTEXT_SHADOW = 3,
-	SHADER_CONTEXT_SSS = 4,
-	SHADER_CONTEXT_VOLUME = 5,
-	SHADER_CONTEXT_NUM = 6
-} ShaderContext;
-
 /* Shader Data
  *
  * Main shader state at a point on the surface or in a volume. All coordinates
@@ -850,7 +863,7 @@ enum ShaderDataFlag {
 	SD_VOLUME_MIS             = (1 << 23),
 	/* Use cubic interpolation for voxels. */
 	SD_VOLUME_CUBIC           = (1 << 24),
-	/* Has data connected to the displacement input. */
+	/* Has data connected to the displacement input or uses bump map. */
 	SD_HAS_BUMP               = (1 << 25),
 	/* Has true displacement. */
 	SD_HAS_DISPLACEMENT       = (1 << 26),
@@ -991,9 +1004,11 @@ typedef struct PathState {
 	int flag;
 
 	/* random number generator state */
-	int rng_offset;    		/* dimension offset */
-	int sample;        		/* path sample number */
-	int num_samples;		/* total number of times this path will be sampled */
+	uint rng_hash;          /* per pixel hash */
+	int rng_offset;         /* dimension offset */
+	int sample;             /* path sample number */
+	int num_samples;        /* total number of times this path will be sampled */
+	float branch_factor;    /* number of branches in indirect paths */
 
 	/* bounce counting */
 	int bounce;
@@ -1016,20 +1031,15 @@ typedef struct PathState {
 	/* volume rendering */
 #ifdef __VOLUME__
 	int volume_bounce;
-	RNG rng_congruential;
+	uint rng_congruential;
 	VolumeStack volume_stack[VOLUME_STACK_SIZE];
 #endif
-
-#ifdef __SHADOW_TRICKS__
-	int catcher_object;
-#endif
 } PathState;
 
 /* Subsurface */
 
 /* Struct to gather multiple SSS hits. */
-typedef struct SubsurfaceIntersection
-{
+typedef struct SubsurfaceIntersection {
 	Ray ray;
 	float3 weight[BSSRDF_MAX_HITS];
 
@@ -1039,17 +1049,14 @@ typedef struct SubsurfaceIntersection
 } SubsurfaceIntersection;
 
 /* Struct to gather SSS indirect rays and delay tracing them. */
-typedef struct SubsurfaceIndirectRays
-{
-	bool need_update_volume_stack;
-	bool tracing;
+typedef struct SubsurfaceIndirectRays {
 	PathState state[BSSRDF_MAX_HITS];
-	struct PathRadiance direct_L;
 
 	int num_rays;
+
 	struct Ray rays[BSSRDF_MAX_HITS];
 	float3 throughputs[BSSRDF_MAX_HITS];
-	struct PathRadiance L[BSSRDF_MAX_HITS];
+	struct PathRadianceState L_state[BSSRDF_MAX_HITS];
 } SubsurfaceIndirectRays;
 
 /* Constant Kernel Data
@@ -1228,7 +1235,6 @@ typedef struct KernelIntegrator {
 	int portal_offset;
 
 	/* bounces */
-	int min_bounce;
 	int max_bounce;
 
 	int max_diffuse_bounce;
@@ -1239,7 +1245,6 @@ typedef struct KernelIntegrator {
 	int ao_bounces;
 
 	/* transparent */
-	int transparent_min_bounce;
 	int transparent_max_bounce;
 	int transparent_shadows;
 
@@ -1282,7 +1287,7 @@ typedef struct KernelIntegrator {
 	float light_inv_rr_threshold;
 
 	int start_sample;
-	int pad1, pad2, pad3;
+	int pad1;
 } KernelIntegrator;
 static_assert_align(KernelIntegrator, 16);
 
@@ -1336,18 +1341,6 @@ typedef struct KernelData {
 } KernelData;
 static_assert_align(KernelData, 16);
 
-#ifdef __KERNEL_DEBUG__
-/* NOTE: This is a runtime-only struct, alignment is not
- * really important here.
- */
-typedef ccl_addr_space struct DebugData {
-	int num_bvh_traversed_nodes;
-	int num_bvh_traversed_instances;
-	int num_bvh_intersections;
-	int num_ray_bounces;
-} DebugData;
-#endif
-
 /* Declarations required for split kernel */
 
 /* Macro for queues */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index 1e472aaf51a..d9c310a893e 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -43,7 +43,7 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
                                                        float3 *extinction)
 {
 	sd->P = P;
-	shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
+	shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
 
 	if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER)))
 		return false;
@@ -69,7 +69,7 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
                                             VolumeShaderCoefficients *coeff)
 {
 	sd->P = P;
-	shader_eval_volume(kg, sd, state, state->volume_stack, state->flag, SHADER_CONTEXT_VOLUME);
+	shader_eval_volume(kg, sd, state, state->volume_stack, state->flag);
 
 	if(!(sd->flag & (SD_ABSORPTION|SD_SCATTER|SD_EMISSION)))
 		return false;
@@ -360,7 +360,6 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
     ShaderData *sd,
     PathRadiance *L,
     ccl_addr_space float3 *throughput,
-    RNG *rng,
     bool probalistic_scatter)
 {
 	VolumeShaderCoefficients coeff;
@@ -380,13 +379,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
 
 		/* pick random color channel, we use the Veach one-sample
 		 * model with balance heuristic for the channels */
-		float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+		float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
 		int channel = (int)(rphase*3.0f);
-		sd->randb_closure = rphase*3.0f - channel;
 
 		/* decide if we will hit or miss */
 		bool scatter = true;
-		float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
+		float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
 
 		if(probalistic_scatter) {
 			float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
@@ -439,7 +437,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
 		float3 sigma_t = coeff.sigma_a + coeff.sigma_s;
 		float3 transmittance = volume_color_transmittance(sigma_t, ray->t);
 		float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, ray->t);
-		path_radiance_accum_emission(L, *throughput, emission, state->bounce);
+		path_radiance_accum_emission(L, state, *throughput, emission);
 	}
 
 	/* modify throughput */
@@ -468,8 +466,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
     Ray *ray,
     ShaderData *sd,
     PathRadiance *L,
-    ccl_addr_space float3 *throughput,
-    RNG *rng)
+    ccl_addr_space float3 *throughput)
 {
 	float3 tp = *throughput;
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -485,10 +482,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
-	float xi = path_state_rng_1D_for_decision(kg, rng, state, PRNG_SCATTER_DISTANCE);
-	float rphase = path_state_rng_1D_for_decision(kg, rng, state, PRNG_PHASE);
+	float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+	float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
 	int channel = (int)(rphase*3.0f);
-	sd->randb_closure = rphase*3.0f - channel;
 	bool has_scatter = false;
 
 	for(int i = 0; i < max_steps; i++) {
@@ -560,7 +556,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 			/* integrate emission attenuated by absorption */
 			if(L && (closure_flag & SD_EMISSION)) {
 				float3 emission = kernel_volume_emission_integrate(&coeff, closure_flag, transmittance, dt);
-				path_radiance_accum_emission(L, tp, emission, state->bounce);
+				path_radiance_accum_emission(L, state, tp, emission);
 			}
 
 			/* modify throughput */
@@ -610,15 +606,14 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(
     Ray *ray,
     PathRadiance *L,
     ccl_addr_space float3 *throughput,
-    RNG *rng,
     bool heterogeneous)
 {
 	shader_setup_from_volume(kg, sd, ray);
 
 	if(heterogeneous)
-		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput, rng);
+		return kernel_volume_integrate_heterogeneous_distance(kg, state, ray, sd, L, throughput);
 	else
-		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true);
+		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
 }
 
 #ifndef __SPLIT_KERNEL__
@@ -846,7 +841,6 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 	/* pick random color channel, we use the Veach one-sample
 	 * model with balance heuristic for the channels */
 	int channel = (int)(rphase*3.0f);
-	sd->randb_closure = rphase*3.0f - channel;
 	float xi = rscatter;
 
 	/* probabilistic scattering decision based on transmittance */
@@ -1000,8 +994,8 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
 		}
 	}
-	if(sample_t < 1e-6f || pdf == 0.0f) {
-		return VOLUME_PATH_SCATTERED;
+	if(sample_t < 0.0f || pdf == 0.0f) {
+		return VOLUME_PATH_MISSED;
 	}
 
 	/* compute transmittance up to this step */
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 28fc5ce1c30..0c11158e8da 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -27,90 +27,54 @@ CCL_NAMESPACE_BEGIN
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
-ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
-{
-	return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples;
-}
-
-ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
-{
-	return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index)
-{
-	return ray_index / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
-{
-	uint total_work_size = kernel_total_work_size(kg);
-	uint num_pools = kernel_num_work_pools(kg);
-
-	if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) {
-		return 0;
-	}
-
-	uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
-
-	uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
-	if(work_pool < remainder / WORK_POOL_SIZE) {
-		work_size += WORK_POOL_SIZE;
-	}
-	else if(work_pool == remainder / WORK_POOL_SIZE) {
-		work_size += remainder % WORK_POOL_SIZE;
-	}
-
-	return work_size;
-}
-
-ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index)
-{
-	uint num_pools = kernel_num_work_pools(kg);
-	uint pool = work_pool_from_ray_index(kg, ray_index);
-
-	return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
-	       + (pool * WORK_POOL_SIZE)
-	       + (work_index % WORK_POOL_SIZE);
-}
-
 /* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index)
+ccl_device bool get_next_work(KernelGlobals *kg,
+                              uint thread_index,
+                              ccl_private uint *global_work_index)
 {
-	uint work_pool = work_pool_from_ray_index(kg, ray_index);
-	uint pool_size = work_pool_work_size(kg, work_pool);
+	uint total_work_size = kernel_split_params.w
+	                     * kernel_split_params.h
+	                     * kernel_split_params.num_samples;
 
-	if(pool_size == 0) {
+	/* With a small amount of work there may be more threads than work due to
+	 * rounding up of the global size; stop such threads immediately. */
+	if(thread_index >= total_work_size) {
 		return false;
 	}
 
-	*work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
-	return (*work_index < pool_size);
-}
+	/* Atomically increment the work index counter of this thread's pool. */
+	uint pool = thread_index / WORK_POOL_SIZE;
+	uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]);
 
-/* This function assumes that the passed `work` is valid. */
-/* Decode sample number w.r.t. assigned `work`. */
-ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index)
-{
-	return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h);
-}
+	/* Map per-pool work index to a global work index. */
+	uint global_size = ccl_global_size(0) * ccl_global_size(1);
+	kernel_assert(global_size % WORK_POOL_SIZE == 0);
+	kernel_assert(thread_index < global_size);
 
-/* Decode pixel and tile position w.r.t. assigned `work`. */
-ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
-                             ccl_private uint *pixel_x,
-                             ccl_private uint *pixel_y,
-                             ccl_private uint *tile_x,
-                             ccl_private uint *tile_y,
-                             uint work_index,
-                             uint ray_index)
-{
-	uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h);
+	*global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+	                   + (pool * WORK_POOL_SIZE)
+	                   + (work_index % WORK_POOL_SIZE);
 
-	*tile_x = pixel_index % kernel_split_params.w;
-	*tile_y = pixel_index / kernel_split_params.w;
+	/* Work remains only while the global work index is within the total. */
+	return (*global_work_index < total_work_size);
+}
 
-	*pixel_x = *tile_x + kernel_split_params.x;
-	*pixel_y = *tile_y + kernel_split_params.y;
+/* Map global work index to pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(KernelGlobals *kg,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
+{
+	uint tile_pixels = kernel_split_params.w * kernel_split_params.h;
+	uint sample_offset = global_work_index / tile_pixels;
+	uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+	uint y_offset = pixel_offset / kernel_split_params.w;
+	uint x_offset = pixel_offset - y_offset * kernel_split_params.w;
+
+	*x = kernel_split_params.x + x_offset;
+	*y = kernel_split_params.y + y_offset;
+	*sample = kernel_split_params.start_sample + sample_offset;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
index 1a7b2040da1..254025be4e2 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
@@ -25,6 +25,7 @@
 #else
 /* SSE optimization disabled for now on 32 bit, see bug #36316 */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
 #    define __KERNEL_SSSE3__
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
index 9fa39dc9ebb..7ae205b7e14 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -81,8 +81,13 @@
 #  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
 #endif
 
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
+/* For the split kernel, using all registers seems fastest for now, but this
+ * is unlikely to be optimal once we resolve other bottlenecks. */
+
+#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
 
 #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
 	__launch_bounds__( \
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 628891b1458..e97e87285a5 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -90,7 +90,7 @@ kernel_cuda_path_trace_data_init(
 
 #define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
 	extern "C" __global__ void \
-	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
 	kernel_cuda_##name() \
 	{ \
 		kernel_##name(NULL); \
@@ -98,7 +98,7 @@ kernel_cuda_path_trace_data_init(
 
 #define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
 	extern "C" __global__ void \
-	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
 	kernel_cuda_##name() \
 	{ \
 		ccl_local type locals; \
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index ba53ba4b26f..f015ac47d8a 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -235,7 +235,7 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
 }
 
 __kernel void kernel_ocl_filter_finalize(int w,
-	                                     int h,
+                                         int h,
                                          ccl_global float *buffer,
                                          ccl_global int *rank,
                                          ccl_global float *XtWX,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 078acc1631e..b7108f3d0f8 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -52,9 +52,7 @@ __kernel void kernel_ocl_path_trace(
 	ccl_global float *buffer,
 	ccl_global uint *rng_state,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int sample,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -63,9 +61,8 @@ __kernel void kernel_ocl_path_trace(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
@@ -82,9 +79,7 @@ __kernel void kernel_ocl_shader(
 	ccl_global float4 *output,
 	ccl_global float *output_luma,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int type, int sx, int sw, int offset, int sample)
 {
@@ -92,9 +87,8 @@ __kernel void kernel_ocl_shader(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 
@@ -114,9 +108,7 @@ __kernel void kernel_ocl_bake(
 	ccl_global uint4 *input,
 	ccl_global float4 *output,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	int type, int filter, int sx, int sw, int offset, int sample)
 {
@@ -124,9 +116,8 @@ __kernel void kernel_ocl_bake(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 
@@ -144,9 +135,7 @@ __kernel void kernel_ocl_convert_to_byte(
 	ccl_global uchar4 *rgba,
 	ccl_global float *buffer,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -155,9 +144,8 @@ __kernel void kernel_ocl_convert_to_byte(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
@@ -171,9 +159,7 @@ __kernel void kernel_ocl_convert_to_half_float(
 	ccl_global uchar4 *rgba,
 	ccl_global float *buffer,
 
-#define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name,
-#include "kernel/kernel_textures.h"
+	KERNEL_BUFFER_PARAMS,
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -182,9 +168,8 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 	kg->data = data;
 
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
@@ -193,7 +178,7 @@ __kernel void kernel_ocl_convert_to_half_float(
 		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset)
+__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset)
 {
 	size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
 
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 8b85d362f8a..95b35e40a45 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -25,11 +25,7 @@ __kernel void kernel_ocl_path_trace_data_init(
         int num_elements,
         ccl_global char *ray_state,
         ccl_global uint *rng_state,
-
-#define KERNEL_TEX(type, ttype, name)                                   \
-        ccl_global type *name,
-#include "kernel/kernel_textures.h"
-
+		KERNEL_BUFFER_PARAMS,
         int start_sample,
         int end_sample,
         int sx, int sy, int sw, int sh, int offset, int stride,
@@ -46,10 +42,7 @@ __kernel void kernel_ocl_path_trace_data_init(
 	                 num_elements,
 	                 ray_state,
 	                 rng_state,
-
-#define KERNEL_TEX(type, ttype, name) name,
-#include "kernel/kernel_textures.h"
-
+	                 KERNEL_BUFFER_ARGS,
 	                 start_sample,
 	                 end_sample,
 	                 sx, sy, sw, sh, offset, stride,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
index 651addb02f4..4cbda1bc2e7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -14,6 +14,9 @@
  * limitations under the License.
  */
 
+#include "kernel/kernel_compat_opencl.h"  // PRECOMPILED
+#include "kernel/split/kernel_split_common.h"  // PRECOMPILED
+
 #include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
 #include "kernel/kernels/opencl/kernel_data_init.cl"
 #include "kernel/kernels/opencl/kernel_path_init.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
index f1e914a70d4..591c3846ef2 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
@@ -25,9 +25,7 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
 		ccl_global char *ray_state,
 		ccl_global uint *rng_state,
 
-#define KERNEL_TEX(type, ttype, name) \
-		ccl_global type *name,
-#include "kernel/kernel_textures.h"
+		KERNEL_BUFFER_PARAMS,
 
 		ccl_global int *queue_index,
 		ccl_global char *use_queues_flag,
@@ -52,12 +50,9 @@ __kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(
 
 		split_data_init(kg, &kernel_split_state, ccl_global_size(0)*ccl_global_size(1), split_data_buffer, ray_state);
 
-#define KERNEL_TEX(type, ttype, name) \
-		kg->name = name;
-#include "kernel/kernel_textures.h"
 	}
 
-	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
 
 	KERNEL_NAME_EVAL(kernel, KERNEL_NAME)(
 			kg
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 02c083a83f8..9585d9f4825 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -86,7 +86,7 @@ struct OSLThreadData {
 	OSL::ShaderGlobals globals;
 	OSL::PerThreadInfo *osl_thread_info;
 	OSLTraceData tracedata;
-	OSL::ShadingContext *context[SHADER_CONTEXT_NUM];
+	OSL::ShadingContext *context;
 	OIIO::TextureSystem::Perthread *oiio_thread_info;
 };
 
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 1535496c73d..8ad2e12b067 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -1197,8 +1197,9 @@ bool OSLRenderServices::trace(TraceOpt &options, OSL::ShaderGlobals *sg,
 	tracedata->init = true;
 	tracedata->sd.osl_globals = sd->osl_globals;
 
-	/* raytrace */
-	return scene_intersect(sd->osl_globals, ray, PATH_RAY_ALL_VISIBILITY, &tracedata->isect, NULL, 0.0f, 0.0f);
+	/* Raytrace, leaving out shadow opaque to avoid early exit. */
+	uint visibility = PATH_RAY_ALL_VISIBILITY - PATH_RAY_SHADOW_OPAQUE;
+	return scene_intersect(sd->osl_globals, ray, visibility, &tracedata->isect, NULL, 0.0f, 0.0f);
 }
 
 
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 13b19d86eca..9a37e0987aa 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -57,9 +57,7 @@ void OSLShader::thread_init(KernelGlobals *kg, KernelGlobals *kernel_globals, OS
 	tdata->globals.tracedata = &tdata->tracedata;
 	tdata->globals.flipHandedness = false;
 	tdata->osl_thread_info = ss->create_thread_info();
-
-	for(int i = 0; i < SHADER_CONTEXT_NUM; i++)
-		tdata->context[i] = ss->get_context(tdata->osl_thread_info);
+	tdata->context = ss->get_context(tdata->osl_thread_info);
 
 	tdata->oiio_thread_info = osl_globals->ts->get_perthread_info();
 
@@ -74,9 +72,7 @@ void OSLShader::thread_free(KernelGlobals *kg)
 
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSLThreadData *tdata = kg->osl_tdata;
-
-	for(int i = 0; i < SHADER_CONTEXT_NUM; i++)
-		ss->release_context(tdata->context[i]);
+	ss->release_context(tdata->context);
 
 	ss->destroy_thread_info(tdata->osl_thread_info);
 
@@ -173,7 +169,7 @@ static void flatten_surface_closure_tree(ShaderData *sd,
 	}
 }
 
-void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -182,7 +178,7 @@ void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state
 	/* execute shader for this point */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 	int shader = sd->shader & SHADER_MASK;
 
 	/* automatic bump shader */
@@ -274,7 +270,7 @@ static void flatten_background_closure_tree(ShaderData *sd,
 	}
 }
 
-void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -283,7 +279,7 @@ void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *st
 	/* execute shader for this point */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 
 	if(kg->osl->background_state) {
 		ss->execute(octx, *(kg->osl->background_state), *globals);
@@ -329,7 +325,7 @@ static void flatten_volume_closure_tree(ShaderData *sd,
 	}
 }
 
-void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx)
+void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -338,7 +334,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
 	/* execute shader */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 	int shader = sd->shader & SHADER_MASK;
 
 	if(kg->osl->volume_state[shader]) {
@@ -352,7 +348,7 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
 
 /* Displacement */
 
-void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx)
+void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd)
 {
 	/* setup shader globals from shader data */
 	OSLThreadData *tdata = kg->osl_tdata;
@@ -364,7 +360,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderConte
 	/* execute shader */
 	OSL::ShadingSystem *ss = (OSL::ShadingSystem*)kg->osl_ss;
 	OSL::ShaderGlobals *globals = &tdata->globals;
-	OSL::ShadingContext *octx = tdata->context[(int)ctx];
+	OSL::ShadingContext *octx = tdata->context;
 	int shader = sd->shader & SHADER_MASK;
 
 	if(kg->osl->displacement_state[shader]) {
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index 32121e940b4..f7020d1223d 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -53,10 +53,10 @@ public:
 	static void thread_free(KernelGlobals *kg);
 
 	/* eval */
-	static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, ShaderContext ctx);
-	static void eval_displacement(KernelGlobals *kg, ShaderData *sd, ShaderContext ctx);
+	static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+	static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+	static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
+	static void eval_displacement(KernelGlobals *kg, ShaderData *sd);
 
 	/* attributes */
 	static int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeDescriptor *desc);
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 2bb981c3918..6870d479af3 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -76,8 +76,8 @@ shader node_principled_bsdf(
 		float aspect = sqrt(1.0 - Anisotropic * 0.9);
 		float r2 = Roughness * Roughness;
 
-		float alpha_x = max(0.001, r2 / aspect);
-		float alpha_y = max(0.001, r2 * aspect);
+		float alpha_x = r2 / aspect;
+		float alpha_y = r2 * aspect;
 
 		color tmp_col = color(1.0, 1.0, 1.0) * (1.0 - SpecularTint) + m_ctint * SpecularTint;
 
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
index e2762a85fc8..2313feac089 100644
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ b/intern/cycles/kernel/split/kernel_branched.h
@@ -87,7 +87,6 @@ ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals
 	PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
 
 	path_radiance_init(inactive_L, kernel_data.film.use_light_pass);
-	inactive_L->direct_throughput = L->direct_throughput;
 	path_radiance_copy_indirect(inactive_L, L);
 
 	ray_state[inactive_ray] = RAY_REGENERATED;
@@ -110,7 +109,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
 	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
 
 	ShaderData *sd = saved_sd;
-	RNG rng = kernel_split_state.rng[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	float3 throughput = branched_state->throughput;
 	ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
@@ -157,37 +155,38 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
 		num_samples = ceil_to_int(num_samples_adjust*num_samples);
 
 		float num_samples_inv = num_samples_adjust/num_samples;
-		RNG bsdf_rng = cmj_hash(rng, i);
 
 		for(int j = branched_state->next_sample; j < num_samples; j++) {
 			if(reset_path_state) {
 				*ps = branched_state->path_state;
 			}
 
+			ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
+
 			ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
 			*tp = throughput;
 
 			ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
 
 			if(!kernel_branched_path_surface_bounce(kg,
-			                                        &bsdf_rng,
 			                                        sd,
 			                                        sc,
 			                                        j,
 			                                        num_samples,
 			                                        tp,
 			                                        ps,
-			                                        L,
+			                                        &L->state,
 			                                        bsdf_ray,
 			                                        sum_sample_weight))
 			{
 				continue;
 			}
 
+			ps->rng_hash = branched_state->path_state.rng_hash;
+
 			/* update state for next iteration */
 			branched_state->next_closure = i;
 			branched_state->next_sample = j+1;
-			branched_state->num_samples = num_samples;
 
 			/* start the indirect path */
 			*tp *= num_samples_inv;
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 4c1fdd2d69c..c9e7deddafa 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -75,92 +75,59 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	if(ray_index != QUEUE_EMPTY_SLOT) {
 #endif
 
-	ccl_global uint *rng_state = kernel_split_params.rng_state;
 	int stride = kernel_split_params.stride;
 
 	ccl_global char *ray_state = kernel_split_state.ray_state;
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
-#endif
 	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-	ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
-	RNG rng = kernel_split_state.rng[ray_index];
-	ccl_global float *buffer = kernel_split_params.buffer;
-
-	unsigned int work_index;
-	ccl_global uint *initial_rng;
-
-	unsigned int sample;
-	unsigned int tile_x;
-	unsigned int tile_y;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-
-	work_index = kernel_split_state.work_array[ray_index];
-	sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
-	                        &tile_x, &tile_y,
-	                        work_index,
-	                        ray_index);
-	initial_rng = rng_state;
-
-	rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
-	buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
 
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-#ifdef __KERNEL_DEBUG__
-		kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
-#endif
+		uint sample = state->sample;
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+		ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
 
 		/* accumulate result in output buffer */
-		bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
-		kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
-
-		path_rng_end(kg, rng_state, rng);
+		kernel_write_result(kg, buffer, sample, L);
 
 		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
 		/* We have completed current work; So get next work */
-		int valid_work = get_next_work(kg, &work_index, ray_index);
-		if(!valid_work) {
+		uint work_index;
+		if(!get_next_work(kg, ray_index, &work_index)) {
 			/* If work is invalid, this means no more work is available and the thread may exit */
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
 		}
 
 		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-			kernel_split_state.work_array[ray_index] = work_index;
-			/* Get the sample associated with the current work */
-			sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-			/* Get pixel and tile position associated with current work */
-			get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
+			uint x, y, sample;
+			get_work_pixel(kg, work_index, &x, &y, &sample);
 
-			/* Remap rng_state according to the current work */
-			rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride;
-			/* Remap buffer according to the current work */
-			buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+			/* Remap rng_state to current pixel. */
+			ccl_global uint *rng_state = kernel_split_params.rng_state;
+			rng_state += kernel_split_params.offset + x + y*stride;
+
+			/* Store buffer offset for writing to passes. */
+			uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride;
+			kernel_split_state.buffer_offset[ray_index] = buffer_offset;
 
 			/* Initialize random numbers and ray. */
-			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng, ray);
+			uint rng_hash;
+			kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, ray);
 
 			if(ray->t != 0.0f) {
-				/* Initialize throughput, L_transparent, Ray, PathState;
+				/* Initialize throughput, path radiance, Ray, PathState;
 				 * These rays proceed with path-iteration.
 				 */
 				*throughput = make_float3(1.0f, 1.0f, 1.0f);
-				*L_transparent = 0.0f;
 				path_radiance_init(L, kernel_data.film.use_light_pass);
-				path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &rng, sample, ray);
+				path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng_hash, sample, ray);
 #ifdef __SUBSURFACE__
 				kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
 #endif
-#ifdef __KERNEL_DEBUG__
-				debug_data_init(debug_data);
-#endif
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
 				enqueue_flag = 1;
 			}
@@ -168,14 +135,13 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 				/* These rays do not participate in path-iteration. */
 				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 				/* Accumulate result in output buffer. */
+				ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
 				kernel_write_pass_float4(buffer, sample, L_rad);
-				path_rng_end(kg, rng_state, rng);
 
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
 			}
 		}
 	}
-	kernel_split_state.rng[ray_index] = rng;
 
 #ifndef __COMPUTE_DEVICE_GPU__
 	}
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index e4545d66eff..2c042dfde6f 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -52,9 +52,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
         ccl_global uint *rng_state,
 
 #ifdef __KERNEL_OPENCL__
-#define KERNEL_TEX(type, ttype, name)                                   \
-        ccl_global type *name,
-#include "kernel/kernel_textures.h"
+		KERNEL_BUFFER_PARAMS,
 #endif
 
         int start_sample,
@@ -100,9 +98,8 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 	split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
 
 #ifdef __KERNEL_OPENCL__
-#define KERNEL_TEX(type, ttype, name) \
-	kg->name = name;
-#include "kernel/kernel_textures.h"
+	kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
+	kernel_set_buffer_info(kg);
 #endif
 
 	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
@@ -127,14 +124,25 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 
 	/* zero the tiles pixels and initialize rng_state if this is the first sample */
 	if(start_sample == 0) {
-		parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) {
-			int pixel = i / kernel_data.film.pass_stride;
-			int pass = i % kernel_data.film.pass_stride;
+		int pass_stride = kernel_data.film.pass_stride;
+
+#ifdef __KERNEL_CPU__
+		for(int y = sy; y < sy + sh; y++) {
+			int index = offset + y * stride;
+			memset(buffer + (sx + index) * pass_stride, 0, sizeof(float) * pass_stride * sw);
+			for(int x = sx; x < sx + sw; x++) {
+				rng_state[index + x] = hash_int_2d(x, y);
+			}
+		}
+#else
+		parallel_for(kg, i, sw * sh * pass_stride) {
+			int pixel = i / pass_stride;
+			int pass = i % pass_stride;
 
 			int x = sx + pixel % sw;
 			int y = sy + pixel / sw;
 
-			int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass;
+			int index = (offset + x + y*stride) * pass_stride + pass;
 
 			*(buffer + index) = 0.0f;
 		}
@@ -146,6 +154,7 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 			int index = (offset + x + y*stride);
 			*(rng_state + index) = hash_int_2d(x, y);
 		}
+#endif
 	}
 
 #endif  /* KERENL_STUB */
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 3336c968a44..2aac66ecb84 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -62,8 +62,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 
 		/* direct lighting */
 #ifdef __EMISSION__
-		RNG rng = kernel_split_state.rng[ray_index];
-
 		bool flag = (kernel_data.integrator.use_direct_light &&
 		             (sd->flag & SD_BSDF_HAS_EVAL));
 
@@ -83,23 +81,20 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 
 		if(flag) {
 			/* Sample illumination from lights to find path contribution. */
-			float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
 			float light_u, light_v;
-			path_state_rng_2D(kg, &rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-			float terminate = path_state_rng_light_termination(kg, &rng, state);
+			path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
+			float terminate = path_state_rng_light_termination(kg, state);
 
 			LightSample ls;
 			if(light_sample(kg,
-			                light_t, light_u, light_v,
+			                light_u, light_v,
 			                sd->time,
 			                sd->P,
 			                state->bounce,
 			                &ls)) {
 
 				Ray light_ray;
-#  ifdef __OBJECT_MOTION__
 				light_ray.time = sd->time;
-#  endif
 
 				BsdfEval L_light;
 				bool is_lamp;
@@ -115,7 +110,6 @@ ccl_device void kernel_direct_lighting(KernelGlobals *kg,
 				}
 			}
 		}
-		kernel_split_state.rng[ray_index] = rng;
 #endif  /* __EMISSION__ */
 	}
 
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
index 9f8dd2392d9..491487f1230 100644
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -30,7 +30,6 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
 	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
 
 	ShaderData *sd = &kernel_split_state.sd[ray_index];
-	RNG rng = kernel_split_state.rng[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
@@ -58,22 +57,21 @@ ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(K
 
 		/* integrate along volume segment with distance sampling */
 		VolumeIntegrateResult result = kernel_volume_integrate(
-			kg, ps, sd, &volume_ray, L, tp, &rng, heterogeneous);
+			kg, ps, sd, &volume_ray, L, tp, heterogeneous);
 
 #  ifdef __VOLUME_SCATTER__
 		if(result == VOLUME_PATH_SCATTERED) {
 			/* direct lighting */
-			kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *tp, &branched_state->path_state, L);
+			kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
 
 			/* indirect light bounce */
-			if(!kernel_path_volume_bounce(kg, &rng, sd, tp, ps, L, pray)) {
+			if(!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
 				continue;
 			}
 
 			/* start the indirect path */
 			branched_state->next_closure = 0;
 			branched_state->next_sample = j+1;
-			branched_state->num_samples = num_samples;
 
 			/* Attempting to share too many samples is slow for volumes as it causes us to
 			 * loop here more and have many calls to kernel_volume_integrate which evaluates
@@ -141,7 +139,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
 	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
 		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
 		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-		RNG rng = kernel_split_state.rng[ray_index];
 		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
 		ShaderData *sd = &kernel_split_state.sd[ray_index];
 		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
@@ -165,15 +162,15 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
 				{
 					/* integrate along volume segment with distance sampling */
 					VolumeIntegrateResult result = kernel_volume_integrate(
-						kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+						kg, state, sd, &volume_ray, L, throughput, heterogeneous);
 
 #  ifdef __VOLUME_SCATTER__
 					if(result == VOLUME_PATH_SCATTERED) {
 						/* direct lighting */
-						kernel_path_volume_connect_light(kg, &rng, sd, emission_sd, *throughput, state, L);
+						kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
 
 						/* indirect light bounce */
-						if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+						if(kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
 							ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
 						}
 						else {
@@ -194,8 +191,6 @@ ccl_device void kernel_do_volume(KernelGlobals *kg)
 			}
 #  endif  /* __BRANCHED_PATH__ */
 		}
-
-		kernel_split_state.rng[ray_index] = rng;
 	}
 
 #  ifdef __BRANCHED_PATH__
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 670a557f084..dffd291012d 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -90,163 +90,58 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 	if(ray_index != QUEUE_EMPTY_SLOT) {
 #endif
 
-	int stride = kernel_split_params.stride;
-
-	unsigned int work_index;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-
-	unsigned int tile_x;
-	unsigned int tile_y;
-	unsigned int sample;
-
-	RNG rng = kernel_split_state.rng[ray_index];
 	ccl_global PathState *state = 0x0;
 	float3 throughput;
 
 	ccl_global char *ray_state = kernel_split_state.ray_state;
 	ShaderData *sd = &kernel_split_state.sd[ray_index];
-	ccl_global float *buffer = kernel_split_params.buffer;
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+		ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
+
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 
 		throughput = kernel_split_state.throughput[ray_index];
 		state = &kernel_split_state.path_state[ray_index];
 
-		work_index = kernel_split_state.work_array[ray_index];
-		sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-		get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
-		                        &tile_x, &tile_y,
-		                        work_index,
-		                        ray_index);
-
-		buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
-
-#ifdef __SHADOW_TRICKS__
-		if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
-			if(state->flag & PATH_RAY_CAMERA) {
-				state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
-				state->catcher_object = sd->object;
-				if(!kernel_data.background.transparent) {
-					PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-					ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-					L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
-				}
-			}
-		}
-		else {
-			state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
-		}
-#endif  /* __SHADOW_TRICKS__ */
-
-		/* holdout */
-#ifdef __HOLDOUT__
-		if(((sd->flag & SD_HOLDOUT) ||
-		    (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
-		   (state->flag & PATH_RAY_CAMERA))
+		if(!kernel_path_shader_apply(kg,
+		                             sd,
+		                             state,
+		                             ray,
+		                             throughput,
+		                             emission_sd,
+		                             L,
+		                             buffer))
 		{
-			if(kernel_data.background.transparent) {
-				float3 holdout_weight;
-				if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
-					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
-				}
-				else {
-					holdout_weight = shader_holdout_eval(kg, sd);
-				}
-				/* any throughput is ok, should all be identical here */
-				kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
-			}
-			if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
-				kernel_split_path_end(kg, ray_index);
-			}
+			kernel_split_path_end(kg, ray_index);
 		}
-#endif  /* __HOLDOUT__ */
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
-#ifdef __BRANCHED_PATH__
-		if(!IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT))
-#endif  /* __BRANCHED_PATH__ */
-		{
-			/* Holdout mask objects do not write data passes. */
-			kernel_write_data_passes(kg,
-				                     buffer,
-				                     L,
-				                     sd,
-				                     sample,
-				                     state,
-				                     throughput);
-		}
-
-		/* Blurring of bsdf after bounces, for rays that have a small likelihood
-		 * of following this particular path (diffuse, rough glossy.
-		 */
-#ifndef __BRANCHED_PATH__
-		if(kernel_data.integrator.filter_glossy != FLT_MAX)
-#else
-		if(kernel_data.integrator.filter_glossy != FLT_MAX &&
-		   (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)))
-#endif  /* __BRANCHED_PATH__ */
-		{
-			float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
-			if(blur_pdf < 1.0f) {
-				float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
-				shader_bsdf_blur(kg, sd, blur_roughness);
-			}
-		}
-
-#ifdef __EMISSION__
-		/* emission */
-		if(sd->flag & SD_EMISSION) {
-			/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
-			float3 emission = indirect_primitive_emission(
-			        kg,
-			        sd,
-			        kernel_split_state.isect[ray_index].t,
-			        state->flag,
-			        state->ray_pdf);
-			path_radiance_accum_emission(L, throughput, emission, state->bounce);
-		}
-#endif  /* __EMISSION__ */
-
 		/* Path termination. this is a strange place to put the termination, it's
 		 * mainly due to the mixed in MIS that we use. gives too many unneeded
 		 * shader evaluations, only need emission if we are going to terminate.
 		 */
-#ifndef __BRANCHED_PATH__
-		float probability = path_state_terminate_probability(kg, state, throughput);
-#else
-		float probability = 1.0f;
-
-		if(!kernel_data.integrator.branched) {
-			probability = path_state_terminate_probability(kg, state, throughput);
-		}
-		else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-			int num_samples = kernel_split_state.branched_state[ray_index].num_samples;
-			probability = path_state_terminate_probability(kg, state, throughput*num_samples);
-		}
-		else if(state->flag & PATH_RAY_TRANSPARENT) {
-			probability = path_state_terminate_probability(kg, state, throughput);
-		}
-#endif
+		float probability = path_state_continuation_probability(kg, state, throughput);
 
 		if(probability == 0.0f) {
 			kernel_split_path_end(kg, ray_index);
 		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-			if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
-				if(terminate >= probability) {
-					kernel_split_path_end(kg, ray_index);
-				}
-				else {
-					kernel_split_state.throughput[ray_index] = throughput/probability;
-				}
+		else if(probability < 1.0f) {
+			float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
+			if(terminate >= probability) {
+				kernel_split_path_end(kg, ray_index);
 			}
+			else {
+				kernel_split_state.throughput[ray_index] = throughput/probability;
+			}
+		}
 
+		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+			PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 			kernel_update_denoising_features(kg, sd, state, L);
 		}
 	}
@@ -260,8 +155,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 	}
 #endif  /* __AO__ */
 
-	kernel_split_state.rng[ray_index] = rng;
-
 #ifndef __COMPUTE_DEVICE_GPU__
 	}
 #endif
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
index f0ebb90f60a..437043a5971 100644
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -33,7 +33,7 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
 		if(ray_index != QUEUE_EMPTY_SLOT) {
 			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 				ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-				if(state->bounce > kernel_data.integrator.ao_bounces) {
+				if(path_state_ao_bounce(kg, state)) {
 					kernel_split_path_end(kg, ray_index);
 				}
 			}
@@ -50,33 +50,16 @@ ccl_device void kernel_indirect_background(KernelGlobals *kg)
 		return;
 	}
 
-	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-	ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
-
 	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		/* eval background shader if nothing hit */
-		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
-			*L_transparent = (*L_transparent) + average((*throughput));
-#ifdef __PASSES__
-			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-				kernel_split_path_end(kg, ray_index);
-		}
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		float3 throughput = kernel_split_state.throughput[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
-		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
-			path_radiance_accum_background(L, state, (*throughput), L_background);
-#endif
-			kernel_split_path_end(kg, ray_index);
-		}
+		kernel_path_background(kg, state, ray, throughput, emission_sd, L);
+		kernel_split_path_end(kg, ray_index);
 	}
-
-
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
index 82bc2f01fd7..e9fe5552e8c 100644
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -54,7 +54,6 @@ ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
 #endif
 		if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
 			ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-			kernel_path_subsurface_accum_indirect(ss_indirect, L);
 
 			/* Trace indirect subsurface rays by restarting the loop. this uses less
 			 * stack memory than invoking kernel_path_indirect.
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index c669d79ddcd..448456d167d 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -57,27 +57,10 @@ ccl_device void kernel_lamp_emission(KernelGlobals *kg)
 
 		float3 throughput = kernel_split_state.throughput[ray_index];
 		Ray ray = kernel_split_state.ray[ray_index];
+		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+		ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
-#ifdef __LAMP_MIS__
-		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
-			/* ray starting from previous non-transparent bounce */
-			Ray light_ray;
-
-			light_ray.P = ray.P - state->ray_t*ray.D;
-			state->ray_t += kernel_split_state.isect[ray_index].t;
-			light_ray.D = ray.D;
-			light_ray.t = state->ray_t;
-			light_ray.time = ray.time;
-			light_ray.dD = ray.dD;
-			light_ray.dP = ray.dP;
-			/* intersect with lamp */
-			float3 emission;
-
-			if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) {
-				path_radiance_accum_emission(L, throughput, emission, state->bounce);
-			}
-		}
-#endif  /* __LAMP_MIS__ */
+		kernel_path_lamp_emission(kg, state, &ray, throughput, isect, emission_sd, L);
 	}
 }
 
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 7758e35fd32..c3373174582 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -126,7 +126,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
 	if(active) {
 		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
 		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-		RNG rng = kernel_split_state.rng[ray_index];
 		ShaderData *sd = &kernel_split_state.sd[ray_index];
 		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -135,7 +134,7 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
 		if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
 #endif
 			/* Compute direct lighting and next bounce. */
-			if(!kernel_path_surface_bounce(kg, &rng, sd, throughput, state, L, ray)) {
+			if(!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
 				kernel_split_path_end(kg, ray_index);
 			}
 #ifdef __BRANCHED_PATH__
@@ -157,8 +156,6 @@ ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
 			}
 		}
 #endif  /* __BRANCHED_PATH__ */
-
-		kernel_split_state.rng[ray_index] = rng;
 	}
 
 	/* Enqueue RAY_UPDATE_BUFFER rays. */
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
index a7ecde7c80d..0ab2289348b 100644
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -29,77 +29,59 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 	 */
 	kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
 
-	unsigned int my_sample;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-	unsigned int tile_x;
-	unsigned int tile_y;
-
-	unsigned int work_index = 0;
 	/* Get work. */
-	if(!get_next_work(kg, &work_index, ray_index)) {
+	uint work_index;
+	if(!get_next_work(kg, ray_index, &work_index)) {
 		/* No more work, mark ray as inactive */
 		kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
 
 		return;
 	}
 
-	/* Get the sample associated with the work. */
-	my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-
-	/* Get pixel and tile position associated with the work. */
-	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
-	                             &tile_x, &tile_y,
-	                             work_index,
-	                             ray_index);
-	kernel_split_state.work_array[ray_index] = work_index;
+	uint x, y, sample;
+	get_work_pixel(kg, work_index, &x, &y, &sample);
 
+	/* Remap rng_state to current pixel. */
 	ccl_global uint *rng_state = kernel_split_params.rng_state;
-	rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
-
-	ccl_global float *buffer = kernel_split_params.buffer;
-	buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
+	rng_state += kernel_split_params.offset + x + y*kernel_split_params.stride;
 
-	RNG rng = kernel_split_state.rng[ray_index];
+	/* Store buffer offset for writing to passes. */
+	uint buffer_offset = (kernel_split_params.offset + x + y*kernel_split_params.stride) * kernel_data.film.pass_stride;
+	kernel_split_state.buffer_offset[ray_index] = buffer_offset;
 
 	/* Initialize random numbers and ray. */
+	uint rng_hash;
 	kernel_path_trace_setup(kg,
 	                        rng_state,
-	                        my_sample,
-	                        pixel_x, pixel_y,
-	                        &rng,
+	                        sample,
+	                        x, y,
+	                        &rng_hash,
 	                        &kernel_split_state.ray[ray_index]);
 
 	if(kernel_split_state.ray[ray_index].t != 0.0f) {
-		/* Initialize throughput, L_transparent, Ray, PathState;
+		/* Initialize throughput, path radiance, Ray, PathState;
 		 * These rays proceed with path-iteration.
 		 */
 		kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
-		kernel_split_state.L_transparent[ray_index] = 0.0f;
 		path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
 		path_state_init(kg,
 		                &kernel_split_state.sd_DL_shadow[ray_index],
 		                &kernel_split_state.path_state[ray_index],
-		                &rng,
-		                my_sample,
+		                rng_hash,
+		                sample,
 		                &kernel_split_state.ray[ray_index]);
 #ifdef __SUBSURFACE__
 		kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
 #endif
-
-#ifdef __KERNEL_DEBUG__
-		debug_data_init(&kernel_split_state.debug_data[ray_index]);
-#endif
 	}
 	else {
 		/* These rays do not participate in path-iteration. */
 		float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 		/* Accumulate result in output buffer. */
-		kernel_write_pass_float4(buffer, my_sample, L_rad);
-		path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]);
+		ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
+		kernel_write_pass_float4(buffer, sample, L_rad);
 		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
 	}
-	kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 45984ca509b..f5378bc172b 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -59,52 +59,14 @@ ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 		return;
 	}
 
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
-#endif
-	Intersection isect;
-	PathState state = kernel_split_state.path_state[ray_index];
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 	Ray ray = kernel_split_state.ray[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 
-	/* intersect scene */
-	uint visibility = path_state_ray_visibility(kg, &state);
-
-	if(state.bounce > kernel_data.integrator.ao_bounces) {
-		visibility = PATH_RAY_SHADOW;
-		ray.t = kernel_data.background.ao_distance;
-	}
-
-#ifdef __HAIR__
-	float difl = 0.0f, extmax = 0.0f;
-	uint lcg_state = 0;
-	RNG rng = kernel_split_state.rng[ray_index];
-
-	if(kernel_data.bvh.have_curves) {
-		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
-			float3 pixdiff = ray.dD.dx + ray.dD.dy;
-			/*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
-			difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
-		}
-
-		extmax = kernel_data.curve.maximum_width;
-		lcg_state = lcg_state_init(&rng, state.rng_offset, state.sample, 0x51633e2d);
-	}
-
-	bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
-#else
-	bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
-#endif
+	Intersection isect;
+	bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L);
 	kernel_split_state.isect[ray_index] = isect;
 
-#ifdef __KERNEL_DEBUG__
-	if(state.flag & PATH_RAY_CAMERA) {
-		debug_data->num_bvh_traversed_nodes += isect.num_traversed_nodes;
-		debug_data->num_bvh_traversed_instances += isect.num_traversed_instances;
-		debug_data->num_bvh_intersections += isect.num_intersections;
-	}
-	debug_data->num_ray_bounces++;
-#endif
-
 	if(!hit) {
 		/* Change the state of rays that hit the background;
 		 * These rays undergo special processing in the
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index 2801b32f285..7032461b04a 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -48,30 +48,18 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg)
 
 	ccl_global char *ray_state = kernel_split_state.ray_state;
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		RNG rng = kernel_split_state.rng[ray_index];
 		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-#ifndef __BRANCHED_PATH__
-		float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
-#else
-		ShaderContext ctx = SHADER_CONTEXT_MAIN;
-		float rbsdf = 0.0f;
-
-		if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-			rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
-
+		shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag);
+#ifdef __BRANCHED_PATH__
+		if(kernel_data.integrator.branched) {
+			shader_merge_closures(&kernel_split_state.sd[ray_index]);
 		}
-
-		if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-			ctx = SHADER_CONTEXT_INDIRECT;
+		else
+#endif
+		{
+			shader_prepare_closures(&kernel_split_state.sd[ray_index], state);
 		}
-
-		shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, ctx);
-		shader_merge_closures(&kernel_split_state.sd[ray_index]);
-#endif  /* __BRANCHED_PATH__ */
-
-		kernel_split_state.rng[ray_index] = rng;
 	}
 }
 
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
index 297decb0bc2..5a55b680695 100644
--- a/intern/cycles/kernel/split/kernel_shader_sort.h
+++ b/intern/cycles/kernel/split/kernel_shader_sort.h
@@ -39,7 +39,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 	ccl_local ushort *local_index = &locals->local_index[0];
 
 	/* copy to local memory */
-	for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
 		uint idx = offset + i + lid;
 		uint add = input + idx;
 		uint value = (~0);
@@ -59,9 +59,9 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 #  ifdef __KERNEL_OPENCL__
 
 	/* bitonic sort */
-	for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
-		for (uint inc = length; inc > 0; inc >>= 1) {
-			for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
+	for(uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
+		for(uint inc = length; inc > 0; inc >>= 1) {
+			for(uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
 				uint i = lid + ii;
 				bool direction = ((i & (length << 1)) != 0);
 				uint j = i ^ inc;
@@ -81,7 +81,7 @@ ccl_device void kernel_shader_sort(KernelGlobals *kg,
 #  endif /* __KERNEL_OPENCL__ */
 
 	/* copy to destination */
-	for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
+	for(uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
 		uint idx = offset + i + lid;
 		uint lidx = local_index[i + lid];
 		uint outi = output + idx;
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
index 474286285a9..79aa2c9435b 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -37,21 +37,18 @@ ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
 	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-	RNG rng = kernel_split_state.rng[ray_index];
 	float3 throughput = kernel_split_state.throughput[ray_index];
 
 #ifdef __BRANCHED_PATH__
 	if(!kernel_data.integrator.branched || IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
 #endif
-		kernel_path_ao(kg, sd, emission_sd, L, state, &rng, throughput, shader_bsdf_alpha(kg, sd));
+		kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
 #ifdef __BRANCHED_PATH__
 	}
 	else {
-		kernel_branched_path_ao(kg, sd, emission_sd, L, state, &rng, throughput);
+		kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
 	}
 #endif
-
-	kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
index 78e61709b01..b52f9a5eb81 100644
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -45,7 +45,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ShaderData *sd = &kernel_split_state.sd[ray_index];
 	float3 throughput = kernel_split_state.throughput[ray_index];
-	RNG rng = kernel_split_state.rng[ray_index];
 
 	BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
 	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
@@ -75,7 +74,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 
 	if(use_branched) {
 		kernel_branched_path_surface_connect_light(kg,
-		                                           &rng,
 		                                           sd,
 		                                           emission_sd,
 		                                           state,
@@ -91,10 +89,11 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 		float3 shadow;
 
 		if(!shadow_blocked(kg,
-			               emission_sd,
-			               state,
-			               &ray,
-			               &shadow))
+		                   sd,
+		                   emission_sd,
+		                   state,
+		                   &ray,
+		                   &shadow))
 		{
 			/* accumulate */
 			path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
@@ -103,8 +102,6 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
 			path_radiance_accum_total_light(L, state, throughput, &L_light);
 		}
 	}
-
-	kernel_split_state.rng[ray_index] = rng;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 08f0124b529..558d327bc76 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -63,7 +63,7 @@ ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
 		PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
 
 		path_radiance_sum_indirect(L);
-		path_radiance_accum_sample(orig_ray_L, L, 1);
+		path_radiance_accum_sample(orig_ray_L, L);
 
 		atomic_fetch_and_dec_uint32((ccl_global uint*)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
 
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index 4bb2f0d3d80..c58c8463f5c 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -56,14 +56,6 @@ typedef struct SplitParams {
 
 /* SPLIT_DATA_ENTRY(type, name, num) */
 
-#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__)
-/* DebugData memory */
-#  define SPLIT_DATA_DEBUG_ENTRIES \
-	SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
-#else
-#  define SPLIT_DATA_DEBUG_ENTRIES
-#endif  /* DEBUG */
-
 #ifdef __BRANCHED_PATH__
 
 typedef ccl_global struct SplitBranchedState {
@@ -80,7 +72,6 @@ typedef ccl_global struct SplitBranchedState {
 	/* indirect loop state */
 	int next_closure;
 	int next_sample;
-	int num_samples;
 
 #ifdef __SUBSURFACE__
 	int ss_next_closure;
@@ -122,9 +113,7 @@ typedef ccl_global struct SplitBranchedState {
 #endif /* __VOLUME__ */
 
 #define SPLIT_DATA_ENTRIES \
-	SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
 	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
-	SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
 	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
 	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
 	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
@@ -133,19 +122,16 @@ typedef ccl_global struct SplitBranchedState {
 	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
 	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
 	SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
-	SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
+	SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
 	SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
 	SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
 	SPLIT_DATA_SUBSURFACE_ENTRIES \
 	SPLIT_DATA_VOLUME_ENTRIES \
 	SPLIT_DATA_BRANCHED_ENTRIES \
-	SPLIT_DATA_DEBUG_ENTRIES \
 
 /* entries to be copied to inactive rays when sharing branched samples (TODO: which are actually needed?) */
 #define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
-	SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
 	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
-	SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
 	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
 	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
 	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
@@ -158,7 +144,6 @@ typedef ccl_global struct SplitBranchedState {
 	SPLIT_DATA_SUBSURFACE_ENTRIES \
 	SPLIT_DATA_VOLUME_ENTRIES \
 	SPLIT_DATA_BRANCHED_ENTRIES \
-	SPLIT_DATA_DEBUG_ENTRIES \
 
 /* struct that holds pointers to data in the shared state buffer */
 typedef struct SplitData {
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index d5083b23f80..3b957856aea 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -38,7 +38,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 	SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
 
 	ShaderData *sd = &branched_state->sd;
-	RNG rng = kernel_split_state.rng[ray_index];
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
 
@@ -52,14 +51,12 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 		if(branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
 		   branched_state->next_closure == 0 && branched_state->next_sample == 0)
 		{
-			branched_state->lcg_state = lcg_state_init(&rng,
-			                                           branched_state->path_state.rng_offset,
-			                                           branched_state->path_state.sample,
-			                                           0x68bc21eb);
+			branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
+			                                                     0x68bc21eb);
 		}
 		int num_samples = kernel_data.integrator.subsurface_samples;
 		float num_samples_inv = 1.0f/num_samples;
-		RNG bssrdf_rng = cmj_hash(rng, i);
+		uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
 
 		/* do subsurface scatter step with copy of shader data, this will
 		 * replace the BSSRDF with a diffuse BSDF closure */
@@ -67,7 +64,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 			ccl_global SubsurfaceIntersection *ss_isect = &branched_state->ss_isect;
 			float bssrdf_u, bssrdf_v;
 			path_branched_rng_2D(kg,
-			                     &bssrdf_rng,
+			                     bssrdf_rng_hash,
 			                     &branched_state->path_state,
 			                     j,
 			                     num_samples,
@@ -77,7 +74,7 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 
 			/* intersection is expensive so avoid doing multiple times for the same input */
 			if(branched_state->next_hit == 0 && branched_state->next_closure == 0 && branched_state->next_sample == 0) {
-				RNG lcg_state = branched_state->lcg_state;
+				uint lcg_state = branched_state->lcg_state;
 				SubsurfaceIntersection ss_isect_private;
 
 				branched_state->num_hits = subsurface_scatter_multi_intersect(kg,
@@ -152,7 +149,6 @@ ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_it
 						int all = (kernel_data.integrator.sample_all_lights_direct) ||
 							      (branched_state->path_state.flag & PATH_RAY_SHADOW_CATCHER);
 						kernel_branched_path_surface_connect_light(kg,
-						                                           &rng,
 						                                           bssrdf_sd,
 						                                           emission_sd,
 						                                           hit_state,
@@ -229,7 +225,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-		RNG rng = kernel_split_state.rng[ray_index];
 		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
 		ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
@@ -246,7 +241,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
 				                                  emission_sd,
 				                                  L,
 				                                  state,
-				                                  &rng,
 				                                  ray,
 				                                  throughput,
 				                                  ss_indirect))
@@ -256,21 +250,17 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
 #ifdef __BRANCHED_PATH__
 			}
 			else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-				float bssrdf_probability;
-				ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+				float bssrdf_u, bssrdf_v;
+				path_state_rng_2D(kg,
+				                  state,
+				                  PRNG_BSDF_U,
+				                  &bssrdf_u, &bssrdf_v);
 
-				/* modify throughput for picking bssrdf or bsdf */
-				*throughput *= bssrdf_probability;
+				const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
 
 				/* do bssrdf scatter step if we picked a bssrdf closure */
 				if(sc) {
-					uint lcg_state = lcg_state_init(&rng, state->rng_offset, state->sample, 0x68bc21eb);
-					float bssrdf_u, bssrdf_v;
-					path_state_rng_2D(kg,
-					                  &rng,
-					                  state,
-					                  PRNG_BSDF_U,
-					                  &bssrdf_u, &bssrdf_v);
+					uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
 					subsurface_scatter_step(kg,
 					                        sd,
 					                        state,
@@ -290,7 +280,6 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
 			}
 #endif
 		}
-		kernel_split_state.rng[ray_index] = rng;
 	}
 
 #  ifdef __BRANCHED_PATH__
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 7704aa545c8..4268813b263 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -280,8 +280,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						float aspect = safe_sqrtf(1.0f - anisotropic * 0.9f);
 						float r2 = roughness * roughness;
 
-						bsdf->alpha_x = fmaxf(0.001f, r2 / aspect);
-						bsdf->alpha_y = fmaxf(0.001f, r2 * aspect);
+						bsdf->alpha_x = r2 / aspect;
+						bsdf->alpha_y = r2 * aspect;
 
 						float m_cdlum = 0.3f * base_color.x + 0.6f * base_color.y + 0.1f * base_color.z; // luminance approx.
 						float3 m_ctint = m_cdlum > 0.0f ? base_color / m_cdlum : make_float3(0.0f, 0.0f, 0.0f); // normalize lum. to isolate hue+sat
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 8e45dbfa5ff..6d6e92e73f6 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,19 +16,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Float4 textures on various devices. */
-#if defined(__KERNEL_CPU__)
-#  define TEX_NUM_FLOAT4_IMAGES		TEX_NUM_FLOAT4_CPU
-#elif defined(__KERNEL_CUDA__)
-#  if __CUDA_ARCH__ < 300
-#    define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_CUDA
-#  else
-#    define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_CUDA_KEPLER
-#  endif
-#else
-#  define TEX_NUM_FLOAT4_IMAGES	TEX_NUM_FLOAT4_OPENCL
-#endif
-
 ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint srgb, uint use_alpha)
 {
 #ifdef __KERNEL_CPU__
@@ -50,94 +37,94 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 
 	switch(id) {
 		case 0: r = kernel_tex_image_interp(__tex_image_float4_000, x, y); break;
-		case 1: r = kernel_tex_image_interp(__tex_image_float4_001, x, y); break;
-		case 2: r = kernel_tex_image_interp(__tex_image_float4_002, x, y); break;
-		case 3: r = kernel_tex_image_interp(__tex_image_float4_003, x, y); break;
-		case 4: r = kernel_tex_image_interp(__tex_image_float4_004, x, y); break;
-		case 5: r = kernel_tex_image_interp(__tex_image_byte4_005, x, y); break;
-		case 6: r = kernel_tex_image_interp(__tex_image_byte4_006, x, y); break;
-		case 7: r = kernel_tex_image_interp(__tex_image_byte4_007, x, y); break;
-		case 8: r = kernel_tex_image_interp(__tex_image_byte4_008, x, y); break;
+		case 8: r = kernel_tex_image_interp(__tex_image_float4_008, x, y); break;
+		case 16: r = kernel_tex_image_interp(__tex_image_float4_016, x, y); break;
+		case 24: r = kernel_tex_image_interp(__tex_image_float4_024, x, y); break;
+		case 32: r = kernel_tex_image_interp(__tex_image_float4_032, x, y); break;
+		case 1: r = kernel_tex_image_interp(__tex_image_byte4_001, x, y); break;
 		case 9: r = kernel_tex_image_interp(__tex_image_byte4_009, x, y); break;
-		case 10: r = kernel_tex_image_interp(__tex_image_byte4_010, x, y); break;
-		case 11: r = kernel_tex_image_interp(__tex_image_byte4_011, x, y); break;
-		case 12: r = kernel_tex_image_interp(__tex_image_byte4_012, x, y); break;
-		case 13: r = kernel_tex_image_interp(__tex_image_byte4_013, x, y); break;
-		case 14: r = kernel_tex_image_interp(__tex_image_byte4_014, x, y); break;
-		case 15: r = kernel_tex_image_interp(__tex_image_byte4_015, x, y); break;
-		case 16: r = kernel_tex_image_interp(__tex_image_byte4_016, x, y); break;
 		case 17: r = kernel_tex_image_interp(__tex_image_byte4_017, x, y); break;
-		case 18: r = kernel_tex_image_interp(__tex_image_byte4_018, x, y); break;
-		case 19: r = kernel_tex_image_interp(__tex_image_byte4_019, x, y); break;
-		case 20: r = kernel_tex_image_interp(__tex_image_byte4_020, x, y); break;
-		case 21: r = kernel_tex_image_interp(__tex_image_byte4_021, x, y); break;
-		case 22: r = kernel_tex_image_interp(__tex_image_byte4_022, x, y); break;
-		case 23: r = kernel_tex_image_interp(__tex_image_byte4_023, x, y); break;
-		case 24: r = kernel_tex_image_interp(__tex_image_byte4_024, x, y); break;
 		case 25: r = kernel_tex_image_interp(__tex_image_byte4_025, x, y); break;
-		case 26: r = kernel_tex_image_interp(__tex_image_byte4_026, x, y); break;
-		case 27: r = kernel_tex_image_interp(__tex_image_byte4_027, x, y); break;
-		case 28: r = kernel_tex_image_interp(__tex_image_byte4_028, x, y); break;
-		case 29: r = kernel_tex_image_interp(__tex_image_byte4_029, x, y); break;
-		case 30: r = kernel_tex_image_interp(__tex_image_byte4_030, x, y); break;
-		case 31: r = kernel_tex_image_interp(__tex_image_byte4_031, x, y); break;
-		case 32: r = kernel_tex_image_interp(__tex_image_byte4_032, x, y); break;
 		case 33: r = kernel_tex_image_interp(__tex_image_byte4_033, x, y); break;
-		case 34: r = kernel_tex_image_interp(__tex_image_byte4_034, x, y); break;
-		case 35: r = kernel_tex_image_interp(__tex_image_byte4_035, x, y); break;
-		case 36: r = kernel_tex_image_interp(__tex_image_byte4_036, x, y); break;
-		case 37: r = kernel_tex_image_interp(__tex_image_byte4_037, x, y); break;
-		case 38: r = kernel_tex_image_interp(__tex_image_byte4_038, x, y); break;
-		case 39: r = kernel_tex_image_interp(__tex_image_byte4_039, x, y); break;
-		case 40: r = kernel_tex_image_interp(__tex_image_byte4_040, x, y); break;
 		case 41: r = kernel_tex_image_interp(__tex_image_byte4_041, x, y); break;
-		case 42: r = kernel_tex_image_interp(__tex_image_byte4_042, x, y); break;
-		case 43: r = kernel_tex_image_interp(__tex_image_byte4_043, x, y); break;
-		case 44: r = kernel_tex_image_interp(__tex_image_byte4_044, x, y); break;
-		case 45: r = kernel_tex_image_interp(__tex_image_byte4_045, x, y); break;
-		case 46: r = kernel_tex_image_interp(__tex_image_byte4_046, x, y); break;
-		case 47: r = kernel_tex_image_interp(__tex_image_byte4_047, x, y); break;
-		case 48: r = kernel_tex_image_interp(__tex_image_byte4_048, x, y); break;
 		case 49: r = kernel_tex_image_interp(__tex_image_byte4_049, x, y); break;
-		case 50: r = kernel_tex_image_interp(__tex_image_byte4_050, x, y); break;
-		case 51: r = kernel_tex_image_interp(__tex_image_byte4_051, x, y); break;
-		case 52: r = kernel_tex_image_interp(__tex_image_byte4_052, x, y); break;
-		case 53: r = kernel_tex_image_interp(__tex_image_byte4_053, x, y); break;
-		case 54: r = kernel_tex_image_interp(__tex_image_byte4_054, x, y); break;
-		case 55: r = kernel_tex_image_interp(__tex_image_byte4_055, x, y); break;
-		case 56: r = kernel_tex_image_interp(__tex_image_byte4_056, x, y); break;
 		case 57: r = kernel_tex_image_interp(__tex_image_byte4_057, x, y); break;
-		case 58: r = kernel_tex_image_interp(__tex_image_byte4_058, x, y); break;
-		case 59: r = kernel_tex_image_interp(__tex_image_byte4_059, x, y); break;
-		case 60: r = kernel_tex_image_interp(__tex_image_byte4_060, x, y); break;
-		case 61: r = kernel_tex_image_interp(__tex_image_byte4_061, x, y); break;
-		case 62: r = kernel_tex_image_interp(__tex_image_byte4_062, x, y); break;
-		case 63: r = kernel_tex_image_interp(__tex_image_byte4_063, x, y); break;
-		case 64: r = kernel_tex_image_interp(__tex_image_byte4_064, x, y); break;
 		case 65: r = kernel_tex_image_interp(__tex_image_byte4_065, x, y); break;
-		case 66: r = kernel_tex_image_interp(__tex_image_byte4_066, x, y); break;
-		case 67: r = kernel_tex_image_interp(__tex_image_byte4_067, x, y); break;
-		case 68: r = kernel_tex_image_interp(__tex_image_byte4_068, x, y); break;
-		case 69: r = kernel_tex_image_interp(__tex_image_byte4_069, x, y); break;
-		case 70: r = kernel_tex_image_interp(__tex_image_byte4_070, x, y); break;
-		case 71: r = kernel_tex_image_interp(__tex_image_byte4_071, x, y); break;
-		case 72: r = kernel_tex_image_interp(__tex_image_byte4_072, x, y); break;
 		case 73: r = kernel_tex_image_interp(__tex_image_byte4_073, x, y); break;
-		case 74: r = kernel_tex_image_interp(__tex_image_byte4_074, x, y); break;
-		case 75: r = kernel_tex_image_interp(__tex_image_byte4_075, x, y); break;
-		case 76: r = kernel_tex_image_interp(__tex_image_byte4_076, x, y); break;
-		case 77: r = kernel_tex_image_interp(__tex_image_byte4_077, x, y); break;
-		case 78: r = kernel_tex_image_interp(__tex_image_byte4_078, x, y); break;
-		case 79: r = kernel_tex_image_interp(__tex_image_byte4_079, x, y); break;
-		case 80: r = kernel_tex_image_interp(__tex_image_byte4_080, x, y); break;
 		case 81: r = kernel_tex_image_interp(__tex_image_byte4_081, x, y); break;
-		case 82: r = kernel_tex_image_interp(__tex_image_byte4_082, x, y); break;
-		case 83: r = kernel_tex_image_interp(__tex_image_byte4_083, x, y); break;
-		case 84: r = kernel_tex_image_interp(__tex_image_byte4_084, x, y); break;
-		case 85: r = kernel_tex_image_interp(__tex_image_byte4_085, x, y); break;
-		case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break;
-		case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break;
-		case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break;
+		case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break;
+		case 97: r = kernel_tex_image_interp(__tex_image_byte4_097, x, y); break;
+		case 105: r = kernel_tex_image_interp(__tex_image_byte4_105, x, y); break;
+		case 113: r = kernel_tex_image_interp(__tex_image_byte4_113, x, y); break;
+		case 121: r = kernel_tex_image_interp(__tex_image_byte4_121, x, y); break;
+		case 129: r = kernel_tex_image_interp(__tex_image_byte4_129, x, y); break;
+		case 137: r = kernel_tex_image_interp(__tex_image_byte4_137, x, y); break;
+		case 145: r = kernel_tex_image_interp(__tex_image_byte4_145, x, y); break;
+		case 153: r = kernel_tex_image_interp(__tex_image_byte4_153, x, y); break;
+		case 161: r = kernel_tex_image_interp(__tex_image_byte4_161, x, y); break;
+		case 169: r = kernel_tex_image_interp(__tex_image_byte4_169, x, y); break;
+		case 177: r = kernel_tex_image_interp(__tex_image_byte4_177, x, y); break;
+		case 185: r = kernel_tex_image_interp(__tex_image_byte4_185, x, y); break;
+		case 193: r = kernel_tex_image_interp(__tex_image_byte4_193, x, y); break;
+		case 201: r = kernel_tex_image_interp(__tex_image_byte4_201, x, y); break;
+		case 209: r = kernel_tex_image_interp(__tex_image_byte4_209, x, y); break;
+		case 217: r = kernel_tex_image_interp(__tex_image_byte4_217, x, y); break;
+		case 225: r = kernel_tex_image_interp(__tex_image_byte4_225, x, y); break;
+		case 233: r = kernel_tex_image_interp(__tex_image_byte4_233, x, y); break;
+		case 241: r = kernel_tex_image_interp(__tex_image_byte4_241, x, y); break;
+		case 249: r = kernel_tex_image_interp(__tex_image_byte4_249, x, y); break;
+		case 257: r = kernel_tex_image_interp(__tex_image_byte4_257, x, y); break;
+		case 265: r = kernel_tex_image_interp(__tex_image_byte4_265, x, y); break;
+		case 273: r = kernel_tex_image_interp(__tex_image_byte4_273, x, y); break;
+		case 281: r = kernel_tex_image_interp(__tex_image_byte4_281, x, y); break;
+		case 289: r = kernel_tex_image_interp(__tex_image_byte4_289, x, y); break;
+		case 297: r = kernel_tex_image_interp(__tex_image_byte4_297, x, y); break;
+		case 305: r = kernel_tex_image_interp(__tex_image_byte4_305, x, y); break;
+		case 313: r = kernel_tex_image_interp(__tex_image_byte4_313, x, y); break;
+		case 321: r = kernel_tex_image_interp(__tex_image_byte4_321, x, y); break;
+		case 329: r = kernel_tex_image_interp(__tex_image_byte4_329, x, y); break;
+		case 337: r = kernel_tex_image_interp(__tex_image_byte4_337, x, y); break;
+		case 345: r = kernel_tex_image_interp(__tex_image_byte4_345, x, y); break;
+		case 353: r = kernel_tex_image_interp(__tex_image_byte4_353, x, y); break;
+		case 361: r = kernel_tex_image_interp(__tex_image_byte4_361, x, y); break;
+		case 369: r = kernel_tex_image_interp(__tex_image_byte4_369, x, y); break;
+		case 377: r = kernel_tex_image_interp(__tex_image_byte4_377, x, y); break;
+		case 385: r = kernel_tex_image_interp(__tex_image_byte4_385, x, y); break;
+		case 393: r = kernel_tex_image_interp(__tex_image_byte4_393, x, y); break;
+		case 401: r = kernel_tex_image_interp(__tex_image_byte4_401, x, y); break;
+		case 409: r = kernel_tex_image_interp(__tex_image_byte4_409, x, y); break;
+		case 417: r = kernel_tex_image_interp(__tex_image_byte4_417, x, y); break;
+		case 425: r = kernel_tex_image_interp(__tex_image_byte4_425, x, y); break;
+		case 433: r = kernel_tex_image_interp(__tex_image_byte4_433, x, y); break;
+		case 441: r = kernel_tex_image_interp(__tex_image_byte4_441, x, y); break;
+		case 449: r = kernel_tex_image_interp(__tex_image_byte4_449, x, y); break;
+		case 457: r = kernel_tex_image_interp(__tex_image_byte4_457, x, y); break;
+		case 465: r = kernel_tex_image_interp(__tex_image_byte4_465, x, y); break;
+		case 473: r = kernel_tex_image_interp(__tex_image_byte4_473, x, y); break;
+		case 481: r = kernel_tex_image_interp(__tex_image_byte4_481, x, y); break;
+		case 489: r = kernel_tex_image_interp(__tex_image_byte4_489, x, y); break;
+		case 497: r = kernel_tex_image_interp(__tex_image_byte4_497, x, y); break;
+		case 505: r = kernel_tex_image_interp(__tex_image_byte4_505, x, y); break;
+		case 513: r = kernel_tex_image_interp(__tex_image_byte4_513, x, y); break;
+		case 521: r = kernel_tex_image_interp(__tex_image_byte4_521, x, y); break;
+		case 529: r = kernel_tex_image_interp(__tex_image_byte4_529, x, y); break;
+		case 537: r = kernel_tex_image_interp(__tex_image_byte4_537, x, y); break;
+		case 545: r = kernel_tex_image_interp(__tex_image_byte4_545, x, y); break;
+		case 553: r = kernel_tex_image_interp(__tex_image_byte4_553, x, y); break;
+		case 561: r = kernel_tex_image_interp(__tex_image_byte4_561, x, y); break;
+		case 569: r = kernel_tex_image_interp(__tex_image_byte4_569, x, y); break;
+		case 577: r = kernel_tex_image_interp(__tex_image_byte4_577, x, y); break;
+		case 585: r = kernel_tex_image_interp(__tex_image_byte4_585, x, y); break;
+		case 593: r = kernel_tex_image_interp(__tex_image_byte4_593, x, y); break;
+		case 601: r = kernel_tex_image_interp(__tex_image_byte4_601, x, y); break;
+		case 609: r = kernel_tex_image_interp(__tex_image_byte4_609, x, y); break;
+		case 617: r = kernel_tex_image_interp(__tex_image_byte4_617, x, y); break;
+		case 625: r = kernel_tex_image_interp(__tex_image_byte4_625, x, y); break;
+		case 633: r = kernel_tex_image_interp(__tex_image_byte4_633, x, y); break;
+		case 641: r = kernel_tex_image_interp(__tex_image_byte4_641, x, y); break;
+		case 649: r = kernel_tex_image_interp(__tex_image_byte4_649, x, y); break;
+		case 657: r = kernel_tex_image_interp(__tex_image_byte4_657, x, y); break;
+		case 665: r = kernel_tex_image_interp(__tex_image_byte4_665, x, y); break;
 		default:
 			kernel_assert(0);
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -224,6 +211,8 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 	object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
+	float3 signed_N = N;
+
 	N.x = fabsf(N.x);
 	N.y = fabsf(N.y);
 	N.z = fabsf(N.z);
@@ -293,12 +282,19 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
 	float4 f = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 	uint use_alpha = stack_valid(alpha_offset);
 
-	if(weight.x > 0.0f)
-		f += weight.x*svm_image_texture(kg, id, co.y, co.z, srgb, use_alpha);
-	if(weight.y > 0.0f)
-		f += weight.y*svm_image_texture(kg, id, co.x, co.z, srgb, use_alpha);
-	if(weight.z > 0.0f)
-		f += weight.z*svm_image_texture(kg, id, co.y, co.x, srgb, use_alpha);
+	/* Map so that no textures are flipped, rotation is somewhat arbitrary. */
+	if(weight.x > 0.0f) {
+		float2 uv = make_float2((signed_N.x < 0.0f)? 1.0f - co.y: co.y, co.z);
+		f += weight.x*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+	}
+	if(weight.y > 0.0f) {
+		float2 uv = make_float2((signed_N.y > 0.0f)? 1.0f - co.x: co.x, co.z);
+		f += weight.y*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+	}
+	if(weight.z > 0.0f) {
+		float2 uv = make_float2((signed_N.z > 0.0f)? 1.0f - co.y: co.y, co.x);
+		f += weight.z*svm_image_texture(kg, id, uv.x, uv.y, srgb, use_alpha);
+	}
 
 	if(stack_valid(out_offset))
 		stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 2d810ff664f..08203163d1a 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -221,28 +221,6 @@ OutputNode *ShaderGraph::output()
 	return (OutputNode*)nodes.front();
 }
 
-ShaderGraph *ShaderGraph::copy()
-{
-	ShaderGraph *newgraph = new ShaderGraph();
-
-	/* copy nodes */
-	ShaderNodeSet nodes_all;
-	foreach(ShaderNode *node, nodes)
-		nodes_all.insert(node);
-
-	ShaderNodeMap nodes_copy;
-	copy_nodes(nodes_all, nodes_copy);
-
-	/* add nodes (in same order, so output is still first) */
-	newgraph->clear_nodes();
-	foreach(ShaderNode *node, nodes)
-		newgraph->add(nodes_copy[node]);
-
-	newgraph->simplified = simplified;
-
-	return newgraph;
-}
-
 void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to)
 {
 	assert(!finalized);
@@ -1040,6 +1018,9 @@ int ShaderGraph::get_num_closures()
 		else if(CLOSURE_IS_PRINCIPLED(closure_type)) {
 			num_closures += 8;
 		}
+		else if(CLOSURE_IS_VOLUME(closure_type)) {
+			num_closures += VOLUME_STACK_SIZE;
+		}
 		else {
 			++num_closures;
 		}
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 72e391991a7..f0fd789c6bd 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -151,6 +151,7 @@ public:
 	virtual bool has_surface_emission() { return false; }
 	virtual bool has_surface_transparent() { return false; }
 	virtual bool has_surface_bssrdf() { return false; }
+	virtual bool has_bump() { return false; }
 	virtual bool has_bssrdf_bump() { return false; }
 	virtual bool has_spatial_varying() { return false; }
 	virtual bool has_object_dependency() { return false; }
@@ -245,8 +246,6 @@ public:
 	ShaderGraph();
 	~ShaderGraph();
 
-	ShaderGraph *copy();
-
 	ShaderNode *add(ShaderNode *node);
 	OutputNode *output();
 
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index f4482e0bb25..bb94b9bb82a 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -43,7 +43,6 @@ static bool isfinite(half /*value*/)
 ImageManager::ImageManager(const DeviceInfo& info)
 {
 	need_update = true;
-	pack_images = false;
 	osl_texture_system = NULL;
 	animation_frame = 0;
 
@@ -87,11 +86,6 @@ ImageManager::~ImageManager()
 	}
 }
 
-void ImageManager::set_pack_images(bool pack_images_)
-{
-	pack_images = pack_images_;
-}
-
 void ImageManager::set_osl_texture_system(void *texture_system)
 {
 	osl_texture_system = texture_system;
@@ -115,16 +109,18 @@ bool ImageManager::set_animation_frame_update(int frame)
 
 ImageDataType ImageManager::get_image_metadata(const string& filename,
                                                void *builtin_data,
-                                               bool& is_linear)
+                                               bool& is_linear,
+                                               bool& builtin_free_cache)
 {
 	bool is_float = false, is_half = false;
 	is_linear = false;
+	builtin_free_cache = false;
 	int channels = 4;
 
 	if(builtin_data) {
 		if(builtin_image_info_cb) {
 			int width, height, depth;
-			builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels);
+			builtin_image_info_cb(filename, builtin_data, is_float, width, height, depth, channels, builtin_free_cache);
 		}
 
 		if(is_float) {
@@ -218,37 +214,14 @@ int ImageManager::max_flattened_slot(ImageDataType type)
 /* The lower three bits of a device texture slot number indicate its type.
  * These functions convert the slot ids from ImageManager "images" ones
  * to device ones and vice verse.
- *
- * There are special cases for CUDA Fermi, since there we have only 90 image texture
- * slots available and should keep the flattended numbers in the 0-89 range.
  */
 int ImageManager::type_index_to_flattened_slot(int slot, ImageDataType type)
 {
-	if(cuda_fermi_limits) {
-		if(type == IMAGE_DATA_TYPE_BYTE4) {
-			return slot + TEX_START_BYTE4_CUDA;
-		}
-		else {
-			return slot;
-		}
-	}
-
 	return (slot << IMAGE_DATA_TYPE_SHIFT) | (type);
 }
 
 int ImageManager::flattened_slot_to_type_index(int flat_slot, ImageDataType *type)
 {
-	if(cuda_fermi_limits) {
-		if(flat_slot >= 4) {
-			*type = IMAGE_DATA_TYPE_BYTE4;
-			return flat_slot - TEX_START_BYTE4_CUDA;
-		}
-		else {
-			*type = IMAGE_DATA_TYPE_FLOAT4;
-			return flat_slot;
-		}
-	}
-
 	*type = (ImageDataType)(flat_slot & IMAGE_DATA_TYPE_MASK);
 	return flat_slot >> IMAGE_DATA_TYPE_SHIFT;
 }
@@ -295,8 +268,9 @@ int ImageManager::add_image(const string& filename,
 {
 	Image *img;
 	size_t slot;
+	bool builtin_free_cache;
 
-	ImageDataType type = get_image_metadata(filename, builtin_data, is_linear);
+	ImageDataType type = get_image_metadata(filename, builtin_data, is_linear, builtin_free_cache);
 
 	thread_scoped_lock device_lock(device_mutex);
 
@@ -364,7 +338,7 @@ int ImageManager::add_image(const string& filename,
 	else {
 		/* Very unlikely, since max_num_images is insanely big. But better safe than sorry. */
 		int tex_count = 0;
-		for (int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
+		for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
 			tex_count += tex_num_images[type];
 		}
 		if(tex_count > max_num_images) {
@@ -382,6 +356,7 @@ int ImageManager::add_image(const string& filename,
 	img = new Image();
 	img->filename = filename;
 	img->builtin_data = builtin_data;
+	img->builtin_free_cache = builtin_free_cache;
 	img->need_load = true;
 	img->animated = animated;
 	img->frame = frame;
@@ -467,7 +442,12 @@ void ImageManager::tag_reload_image(const string& filename,
 	}
 }
 
-bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components)
+bool ImageManager::file_load_image_generic(Image *img,
+                                           ImageInput **in,
+                                           int &width,
+                                           int &height,
+                                           int &depth,
+                                           int &components)
 {
 	if(img->filename == "")
 		return false;
@@ -506,8 +486,8 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid
 		if(!builtin_image_info_cb || !builtin_image_pixels_cb)
 			return false;
 
-		bool is_float;
-		builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components);
+		bool is_float, free_cache;
+		builtin_image_info_cb(img->filename, img->builtin_data, is_float, width, height, depth, components, free_cache);
 	}
 
 	/* we only handle certain number of components */
@@ -542,6 +522,10 @@ bool ImageManager::file_load_image(Image *img,
 	vector<StorageType> pixels_storage;
 	StorageType *pixels;
 	const size_t max_size = max(max(width, height), depth);
+	if(max_size == 0) {
+		/* Don't bother with invalid images. */
+		return false;
+	}
 	if(texture_limit > 0 && max_size > texture_limit) {
 		pixels_storage.resize(((size_t)width)*height*depth*4);
 		pixels = &pixels_storage[0];
@@ -549,6 +533,10 @@ bool ImageManager::file_load_image(Image *img,
 	else {
 		pixels = (StorageType*)tex_img.resize(width, height, depth);
 	}
+	if(pixels == NULL) {
+		/* Could be that we've run out of memory. */
+		return false;
+	}
 	bool cmyk = false;
 	const size_t num_pixels = ((size_t)width) * height * depth;
 	if(in) {
@@ -588,13 +576,15 @@ bool ImageManager::file_load_image(Image *img,
 			builtin_image_float_pixels_cb(img->filename,
 			                              img->builtin_data,
 			                              (float*)&pixels[0],
-			                              num_pixels * components);
+			                              num_pixels * components,
+			                              img->builtin_free_cache);
 		}
 		else if(FileFormat == TypeDesc::UINT8) {
 			builtin_image_pixels_cb(img->filename,
 			                        img->builtin_data,
 			                        (uchar*)&pixels[0],
-			                        num_pixels * components);
+			                        num_pixels * components,
+			                        img->builtin_free_cache);
 		}
 		else {
 			/* TODO(dingto): Support half for ImBuf. */
@@ -754,7 +744,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[3] = TEX_IMAGE_MISSING_A;
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -783,7 +773,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[0] = TEX_IMAGE_MISSING_R;
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -815,7 +805,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[3] = (TEX_IMAGE_MISSING_A * 255);
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -843,7 +833,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[0] = (TEX_IMAGE_MISSING_R * 255);
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -874,7 +864,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[3] = TEX_IMAGE_MISSING_A;
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -902,7 +892,7 @@ void ImageManager::device_load_image(Device *device,
 			pixels[0] = TEX_IMAGE_MISSING_R;
 		}
 
-		if(!pack_images) {
+		{
 			thread_scoped_lock device_lock(device_mutex);
 			device->tex_alloc(name.c_str(),
 			                  tex_img,
@@ -1059,9 +1049,6 @@ void ImageManager::device_update(Device *device,
 
 	pool.wait_work();
 
-	if(pack_images)
-		device_pack_images(device, dscene, progress);
-
 	need_update = false;
 }
 
@@ -1091,141 +1078,6 @@ void ImageManager::device_update_slot(Device *device,
 	}
 }
 
-uint8_t ImageManager::pack_image_options(ImageDataType type, size_t slot)
-{
-	uint8_t options = 0;
-	/* Image Options are packed into one uint:
-	 * bit 0 -> Interpolation
-	 * bit 1 + 2 + 3 -> Extension
-	 */
-	if(images[type][slot]->interpolation == INTERPOLATION_CLOSEST) {
-		options |= (1 << 0);
-	}
-	if(images[type][slot]->extension == EXTENSION_REPEAT) {
-		options |= (1 << 1);
-	}
-	else if(images[type][slot]->extension == EXTENSION_EXTEND) {
-		options |= (1 << 2);
-	}
-	else /* EXTENSION_CLIP */ {
-		options |= (1 << 3);
-	}
-	return options;
-}
-
-template<typename T>
-void ImageManager::device_pack_images_type(
-        ImageDataType type,
-        const vector<device_vector<T>*>& cpu_textures,
-        device_vector<T> *device_image,
-        uint4 *info)
-{
-	size_t size = 0, offset = 0;
-	/* First step is to calculate size of the texture we need. */
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(images[type][slot] == NULL) {
-			continue;
-		}
-		device_vector<T>& tex_img = *cpu_textures[slot];
-		size += tex_img.size();
-	}
-	/* Now we know how much memory we need, so we can allocate and fill. */
-	T *pixels = device_image->resize(size);
-	for(size_t slot = 0; slot < images[type].size(); slot++) {
-		if(images[type][slot] == NULL) {
-			continue;
-		}
-		device_vector<T>& tex_img = *cpu_textures[slot];
-		uint8_t options = pack_image_options(type, slot);
-		const int index = type_index_to_flattened_slot(slot, type) * 2;
-		info[index] = make_uint4(tex_img.data_width,
-		                         tex_img.data_height,
-		                         offset,
-		                         options);
-		info[index+1] = make_uint4(tex_img.data_depth, 0, 0, 0);
-		memcpy(pixels + offset,
-		       (void*)tex_img.data_pointer,
-		       tex_img.memory_size());
-		offset += tex_img.size();
-	}
-}
-
-void ImageManager::device_pack_images(Device *device,
-                                      DeviceScene *dscene,
-                                      Progress& /*progess*/)
-{
-	/* For OpenCL, we pack all image textures into a single large texture, and
-	 * do our own interpolation in the kernel.
-	 */
-
-	/* TODO(sergey): This will over-allocate a bit, but this is constant memory
-	 * so should be fine for a short term.
-	 */
-	const size_t info_size = max4(max_flattened_slot(IMAGE_DATA_TYPE_FLOAT4),
-	                              max_flattened_slot(IMAGE_DATA_TYPE_BYTE4),
-	                              max_flattened_slot(IMAGE_DATA_TYPE_FLOAT),
-	                              max_flattened_slot(IMAGE_DATA_TYPE_BYTE));
-	uint4 *info = dscene->tex_image_packed_info.resize(info_size*2);
-
-	/* Pack byte4 textures. */
-	device_pack_images_type(IMAGE_DATA_TYPE_BYTE4,
-	                        dscene->tex_byte4_image,
-	                        &dscene->tex_image_byte4_packed,
-	                        info);
-	/* Pack float4 textures. */
-	device_pack_images_type(IMAGE_DATA_TYPE_FLOAT4,
-	                        dscene->tex_float4_image,
-	                        &dscene->tex_image_float4_packed,
-	                        info);
-	/* Pack byte textures. */
-	device_pack_images_type(IMAGE_DATA_TYPE_BYTE,
-	                        dscene->tex_byte_image,
-	                        &dscene->tex_image_byte_packed,
-	                        info);
-	/* Pack float textures. */
-	device_pack_images_type(IMAGE_DATA_TYPE_FLOAT,
-	                        dscene->tex_float_image,
-	                        &dscene->tex_image_float_packed,
-	                        info);
-
-	/* Push textures to the device. */
-	if(dscene->tex_image_byte4_packed.size()) {
-		if(dscene->tex_image_byte4_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_byte4_packed);
-		}
-		device->tex_alloc("__tex_image_byte4_packed", dscene->tex_image_byte4_packed);
-	}
-	if(dscene->tex_image_float4_packed.size()) {
-		if(dscene->tex_image_float4_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_float4_packed);
-		}
-		device->tex_alloc("__tex_image_float4_packed", dscene->tex_image_float4_packed);
-	}
-	if(dscene->tex_image_byte_packed.size()) {
-		if(dscene->tex_image_byte_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_byte_packed);
-		}
-		device->tex_alloc("__tex_image_byte_packed", dscene->tex_image_byte_packed);
-	}
-	if(dscene->tex_image_float_packed.size()) {
-		if(dscene->tex_image_float_packed.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_float_packed);
-		}
-		device->tex_alloc("__tex_image_float_packed", dscene->tex_image_float_packed);
-	}
-	if(dscene->tex_image_packed_info.size()) {
-		if(dscene->tex_image_packed_info.device_pointer) {
-			thread_scoped_lock device_lock(device_mutex);
-			device->tex_free(dscene->tex_image_packed_info);
-		}
-		device->tex_alloc("__tex_image_packed_info", dscene->tex_image_packed_info);
-	}
-}
-
 void ImageManager::device_free_builtin(Device *device, DeviceScene *dscene)
 {
 	for(int type = 0; type < IMAGE_DATA_NUM_TYPES; type++) {
@@ -1251,18 +1103,6 @@ void ImageManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->tex_float_image.clear();
 	dscene->tex_byte_image.clear();
 	dscene->tex_half_image.clear();
-
-	device->tex_free(dscene->tex_image_float4_packed);
-	device->tex_free(dscene->tex_image_byte4_packed);
-	device->tex_free(dscene->tex_image_float_packed);
-	device->tex_free(dscene->tex_image_byte_packed);
-	device->tex_free(dscene->tex_image_packed_info);
-
-	dscene->tex_image_float4_packed.clear();
-	dscene->tex_image_byte4_packed.clear();
-	dscene->tex_image_float_packed.clear();
-	dscene->tex_image_byte_packed.clear();
-	dscene->tex_image_packed_info.clear();
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 77214bf25bc..c86d1cbedbf 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -57,7 +57,10 @@ public:
 	                      InterpolationType interpolation,
 	                      ExtensionType extension,
 	                      bool use_alpha);
-	ImageDataType get_image_metadata(const string& filename, void *builtin_data, bool& is_linear);
+	ImageDataType get_image_metadata(const string& filename,
+	                                 void *builtin_data,
+	                                 bool& is_linear,
+	                                 bool& builtin_free_cache);
 
 	void device_prepare_update(DeviceScene *dscene);
 	void device_update(Device *device,
@@ -73,7 +76,6 @@ public:
 	void device_free_builtin(Device *device, DeviceScene *dscene);
 
 	void set_osl_texture_system(void *texture_system);
-	void set_pack_images(bool pack_images_);
 	bool set_animation_frame_update(int frame);
 
 	bool need_update;
@@ -88,19 +90,23 @@ public:
 	              int &width,
 	              int &height,
 	              int &depth,
-	              int &channels)> builtin_image_info_cb;
+	              int &channels,
+	              bool &free_cache)> builtin_image_info_cb;
 	function<bool(const string &filename,
 	              void *data,
 	              unsigned char *pixels,
-	              const size_t pixels_size)> builtin_image_pixels_cb;
+	              const size_t pixels_size,
+	              const bool free_cache)> builtin_image_pixels_cb;
 	function<bool(const string &filename,
 	              void *data,
 	              float *pixels,
-	              const size_t pixels_size)> builtin_image_float_pixels_cb;
+	              const size_t pixels_size,
+	              const bool free_cache)> builtin_image_float_pixels_cb;
 
 	struct Image {
 		string filename;
 		void *builtin_data;
+		bool builtin_free_cache;
 
 		bool use_alpha;
 		bool need_load;
@@ -123,9 +129,13 @@ private:
 
 	vector<Image*> images[IMAGE_DATA_NUM_TYPES];
 	void *osl_texture_system;
-	bool pack_images;
 
-	bool file_load_image_generic(Image *img, ImageInput **in, int &width, int &height, int &depth, int &components);
+	bool file_load_image_generic(Image *img,
+	                             ImageInput **in,
+	                             int &width,
+	                             int &height,
+	                             int &depth,
+	                             int &components);
 
 	template<TypeDesc::BASETYPE FileFormat,
 	         typename StorageType,
@@ -140,8 +150,6 @@ private:
 	int flattened_slot_to_type_index(int flat_slot, ImageDataType *type);
 	string name_from_type(int type);
 
-	uint8_t pack_image_options(ImageDataType type, size_t slot);
-
 	void device_load_image(Device *device,
 	                       DeviceScene *dscene,
 	                       Scene *scene,
@@ -152,17 +160,6 @@ private:
 	                       DeviceScene *dscene,
 	                       ImageDataType type,
 	                       int slot);
-
-	template<typename T>
-	void device_pack_images_type(
-	        ImageDataType type,
-	        const vector<device_vector<T>*>& cpu_textures,
-	        device_vector<T> *device_image,
-	        uint4 *info);
-
-	void device_pack_images(Device *device,
-	                        DeviceScene *dscene,
-	                        Progress& progess);
 };
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index a004bb5b856..15b728d6e02 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -31,7 +31,6 @@ NODE_DEFINE(Integrator)
 {
 	NodeType *type = NodeType::add("integrator", create);
 
-	SOCKET_INT(min_bounce, "Min Bounce", 2);
 	SOCKET_INT(max_bounce, "Max Bounce", 7);
 
 	SOCKET_INT(max_diffuse_bounce, "Max Diffuse Bounce", 7);
@@ -39,9 +38,7 @@ NODE_DEFINE(Integrator)
 	SOCKET_INT(max_transmission_bounce, "Max Transmission Bounce", 7);
 	SOCKET_INT(max_volume_bounce, "Max Volume Bounce", 7);
 
-	SOCKET_INT(transparent_min_bounce, "Transparent Min Bounce", 2);
 	SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7);
-	SOCKET_BOOLEAN(transparent_shadows, "Transparent Shadows", false);
 
 	SOCKET_INT(ao_bounces, "AO Bounces", 0);
 
@@ -104,7 +101,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 
 	/* integrator parameters */
 	kintegrator->max_bounce = max_bounce + 1;
-	kintegrator->min_bounce = min_bounce + 1;
 
 	kintegrator->max_diffuse_bounce = max_diffuse_bounce + 1;
 	kintegrator->max_glossy_bounce = max_glossy_bounce + 1;
@@ -112,7 +108,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	kintegrator->max_volume_bounce = max_volume_bounce + 1;
 
 	kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
-	kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
 
 	if(ao_bounces == 0) {
 		kintegrator->ao_bounces = INT_MAX;
@@ -125,19 +120,14 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
 	 * We only need to enable transparent shadows, if we actually have 
 	 * transparent shaders in the scene. Otherwise we can disable it
 	 * to improve performance a bit. */
-	if(transparent_shadows) {
-		kintegrator->transparent_shadows = false;
-		foreach(Shader *shader, scene->shaders) {
-			/* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */
-			if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) {
-				kintegrator->transparent_shadows = true;
-				break;
-			}
+	kintegrator->transparent_shadows = false;
+	foreach(Shader *shader, scene->shaders) {
+		/* keep this in sync with SD_HAS_TRANSPARENT_SHADOW in shader.cpp */
+		if((shader->has_surface_transparent && shader->use_transparent_shadow) || shader->has_volume) {
+			kintegrator->transparent_shadows = true;
+			break;
 		}
 	}
-	else {
-		kintegrator->transparent_shadows = false;
-	}
 
 	kintegrator->volume_max_steps = volume_max_steps;
 	kintegrator->volume_step_size = volume_step_size;
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 9501d7f8416..3cb430d72b4 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -31,7 +31,6 @@ class Integrator : public Node {
 public:
 	NODE_DECLARE
 
-	int min_bounce;
 	int max_bounce;
 
 	int max_diffuse_bounce;
@@ -39,9 +38,7 @@ public:
 	int max_transmission_bounce;
 	int max_volume_bounce;
 
-	int transparent_min_bounce;
 	int transparent_max_bounce;
-	bool transparent_shadows;
 
 	int ao_bounces;
 
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 93d88c5642c..4adc00bc839 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -225,17 +225,13 @@ void LightManager::disable_ineffective_light(Device *device, Scene *scene)
 bool LightManager::object_usable_as_light(Object *object) {
 	Mesh *mesh = object->mesh;
 	/* Skip objects with NaNs */
-	if (!object->bounds.valid()) {
+	if(!object->bounds.valid()) {
 		return false;
 	}
 	/* Skip if we are not visible for BSDFs. */
 	if(!(object->visibility & (PATH_RAY_DIFFUSE|PATH_RAY_GLOSSY|PATH_RAY_TRANSMIT))) {
 		return false;
 	}
-	/* Skip motion blurred deforming meshes, not supported yet. */
-	if(mesh->has_motion_blur()) {
-		return false;
-	}
 	/* Skip if we have no emission shaders. */
 	/* TODO(sergey): Ideally we want to avoid such duplicated loop, since it'll
 	 * iterate all mesh shaders twice (when counting and when calculating
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index 03825f780e0..84537bf5993 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -1925,16 +1925,7 @@ void MeshManager::device_update_displacement_images(Device *device,
 					if(node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
 						continue;
 					}
-					if(device->info.pack_images) {
-						/* If device requires packed images we need to update all
-						 * images now, even if they're not used for displacement.
-						 */
-						image_manager->device_update(device,
-						                             dscene,
-						                             scene,
-						                             progress);
-						return;
-					}
+
 					ImageSlotTextureNode *image_node = static_cast<ImageSlotTextureNode*>(node);
 					int slot = image_node->slot;
 					if(slot != -1) {
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 86e25df1da3..2b682756c6a 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -365,7 +365,8 @@ void ImageTextureNode::compile(OSLCompiler& compiler)
 	if(is_float == -1) {
 		if(builtin_data == NULL) {
 			ImageDataType type;
-			type = image_manager->get_image_metadata(filename.string(), NULL, is_linear);
+			bool builtin_free_cache;
+			type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache);
 			if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
 				is_float = 1;
 		}
@@ -554,7 +555,8 @@ void EnvironmentTextureNode::compile(OSLCompiler& compiler)
 	if(is_float == -1) {
 		if(builtin_data == NULL) {
 			ImageDataType type;
-			type = image_manager->get_image_metadata(filename.string(), NULL, is_linear);
+			bool builtin_free_cache;
+			type = image_manager->get_image_metadata(filename.string(), NULL, is_linear, builtin_free_cache);
 			if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
 				is_float = 1;
 		}
@@ -1799,6 +1801,14 @@ BsdfBaseNode::BsdfBaseNode(const NodeType *node_type)
 	special_type = SHADER_SPECIAL_TYPE_CLOSURE;
 }
 
+bool BsdfBaseNode::has_bump()
+{
+	/* detect if anything is plugged into the normal input besides the default */
+	ShaderInput *normal_in = input("Normal");
+	return (normal_in && normal_in->link &&
+	        normal_in->link->parent->special_type != SHADER_SPECIAL_TYPE_GEOMETRY);
+}
+
 /* BSDF Closure */
 
 BsdfNode::BsdfNode(const NodeType *node_type)
@@ -2437,9 +2447,7 @@ void PrincipledBsdfNode::compile(OSLCompiler& compiler)
 
 bool PrincipledBsdfNode::has_bssrdf_bump()
 {
-	/* detect if anything is plugged into the normal input besides the default */
-	ShaderInput *normal_in = input("Normal");
-	return (normal_in->link && normal_in->link->parent->special_type != SHADER_SPECIAL_TYPE_GEOMETRY);
+	return has_surface_bssrdf() && has_bump();
 }
 
 /* Translucent BSDF Closure */
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index c0271a3c8eb..ec4c7c7c50d 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -326,6 +326,16 @@ class BsdfBaseNode : public ShaderNode {
 public:
 	BsdfBaseNode(const NodeType *node_type);
 
+	bool has_spatial_varying() { return true; }
+	virtual ClosureType get_closure_type() { return closure; }
+	virtual bool has_bump();
+
+	virtual bool equals(const ShaderNode& /*other*/)
+	{
+		/* TODO(sergey): With some care BSDF nodes can be de-duplicated. */
+		return false;
+	}
+
 	ClosureType closure;
 };
 
@@ -334,19 +344,11 @@ public:
 	explicit BsdfNode(const NodeType *node_type);
 	SHADER_NODE_BASE_CLASS(BsdfNode)
 
-	bool has_spatial_varying() { return true; }
 	void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL);
-	virtual ClosureType get_closure_type() { return closure; }
 
 	float3 color;
 	float3 normal;
 	float surface_mix_weight;
-
-	virtual bool equals(const ShaderNode& /*other*/)
-	{
-		/* TODO(sergey): With some care BSDF nodes can be de-duplicated. */
-		return false;
-	}
 };
 
 class AnisotropicBsdfNode : public BsdfNode {
@@ -373,7 +375,6 @@ class PrincipledBsdfNode : public BsdfBaseNode {
 public:
 	SHADER_NODE_CLASS(PrincipledBsdfNode)
 
-	bool has_spatial_varying() { return true; }
 	bool has_surface_bssrdf();
 	bool has_bssrdf_bump();
 	void compile(SVMCompiler& compiler, ShaderInput *metallic, ShaderInput *subsurface, ShaderInput *subsurface_radius,
@@ -390,13 +391,6 @@ public:
 	float surface_mix_weight;
 	ClosureType distribution, distribution_orig;
 
-	virtual bool equals(const ShaderNode * /*other*/)
-	{
-		/* TODO(sergey): With some care BSDF nodes can be de-duplicated. */
-		return false;
-	}
-
-	ClosureType get_closure_type() { return closure; }
 	bool has_integrator_dependency();
 	void attributes(Shader *shader, AttributeRequestSet *attributes);
 };
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 375abfeb27a..12690090066 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -262,6 +262,17 @@ bool Object::is_traceable()
 	return true;
 }
 
+uint Object::visibility_for_tracing() const {
+	uint trace_visibility = visibility;
+	if (is_shadow_catcher) {
+		trace_visibility &= ~PATH_RAY_SHADOW_NON_CATCHER;
+	}
+	else {
+		trace_visibility &= ~PATH_RAY_SHADOW_CATCHER;
+	}
+	return trace_visibility;
+}
+
 /* Object Manager */
 
 ObjectManager::ObjectManager()
@@ -356,6 +367,13 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s
 	/* OBJECT_PROPERTIES */
 	objects[offset+8] = make_float4(surface_area, pass_id, random_number, __int_as_float(particle_index));
 
+	if(mesh->use_motion_blur) {
+		state->have_motion = true;
+	}
+	if(mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION)) {
+		flag |= SD_OBJECT_HAS_VERTEX_MOTION;
+	}
+
 	if(state->need_motion == Scene::MOTION_PASS) {
 		/* Motion transformations, is world/object space depending if mesh
 		 * comes with deformed position in object space, or if we transform
@@ -376,9 +394,6 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s
 			mtfm.pre = mtfm.pre * itfm;
 			mtfm.post = mtfm.post * itfm;
 		}
-		else {
-			flag |= SD_OBJECT_HAS_VERTEX_MOTION;
-		}
 
 		memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+0], &mtfm.pre, sizeof(float4)*3);
 		memcpy(&objects_vector[object_index*OBJECT_VECTOR_SIZE+3], &mtfm.post, sizeof(float4)*3);
@@ -397,10 +412,6 @@ void ObjectManager::device_update_object_transform(UpdateObejctTransformState *s
 	}
 #endif
 
-	if(mesh->use_motion_blur) {
-		state->have_motion = true;
-	}
-
 	/* Dupli object coords and motion info. */
 	int totalsteps = mesh->motion_steps;
 	int numsteps = (totalsteps - 1)/2;
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 12d7b2c81cf..6927bbfe4c7 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -60,7 +60,7 @@ public:
 
 	ParticleSystem *particle_system;
 	int particle_index;
-	
+
 	Object();
 	~Object();
 
@@ -75,6 +75,11 @@ public:
 	 * kernel scene.
 	 */
 	bool is_traceable();
+
+	/* Combine object's visibility with all possible internal run-time
+	 * determined flags which denote trace-time visibility.
+	 */
+	uint visibility_for_tracing() const;
 };
 
 /* Object Manager */
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index a794f233718..5c5ac6e2be9 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -233,8 +233,10 @@ void OSLShaderManager::shading_system_init()
 			"glossy",			/* PATH_RAY_GLOSSY */
 			"singular",			/* PATH_RAY_SINGULAR */
 			"transparent",		/* PATH_RAY_TRANSPARENT */
-			"shadow",			/* PATH_RAY_SHADOW_OPAQUE */
-			"shadow",			/* PATH_RAY_SHADOW_TRANSPARENT */
+			"shadow",			/* PATH_RAY_SHADOW_OPAQUE_NON_CATCHER */
+			"shadow",			/* PATH_RAY_SHADOW_OPAQUE_CATCHER */
+			"shadow",			/* PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER */
+			"shadow",			/* PATH_RAY_SHADOW_TRANSPARENT_CATCHER */
 
 			"__unused__",
 			"__unused__",
@@ -719,6 +721,7 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
 				current_shader->has_surface_bssrdf = true;
 				current_shader->has_bssrdf_bump = true; /* can't detect yet */
 			}
+			current_shader->has_bump = true; /* can't detect yet */
 		}
 
 		if(node->has_spatial_varying()) {
@@ -1027,6 +1030,9 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet& nodes)
 							if(node->has_bssrdf_bump())
 								current_shader->has_bssrdf_bump = true;
 						}
+						if(node->has_bump()) {
+							current_shader->has_bump = true;
+						}
 					}
 					else if(current_type == SHADER_TYPE_VOLUME) {
 						if(node->has_spatial_varying())
@@ -1089,21 +1095,14 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
 		ShaderGraph *graph = shader->graph;
 		ShaderNode *output = (graph)? graph->output(): NULL;
 
-		/* copy graph for shader with bump mapping */
-		if(output->input("Surface")->link && output->input("Displacement")->link)
-			if(!shader->graph_bump)
-				shader->graph_bump = shader->graph->copy();
+		bool has_bump = (shader->displacement_method != DISPLACE_TRUE) &&
+		                output->input("Surface")->link && output->input("Displacement")->link;
 
 		/* finalize */
 		shader->graph->finalize(scene,
-		                        false,
-		                        shader->has_integrator_dependency);
-		if(shader->graph_bump) {
-			shader->graph_bump->finalize(scene,
-			                             true,
-			                             shader->has_integrator_dependency,
-			                             shader->displacement_method == DISPLACE_BOTH);
-		}
+		                        has_bump,
+		                        shader->has_integrator_dependency,
+		                        shader->displacement_method == DISPLACE_BOTH);
 
 		current_shader = shader;
 
@@ -1111,7 +1110,8 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
 		shader->has_surface_emission = false;
 		shader->has_surface_transparent = false;
 		shader->has_surface_bssrdf = false;
-		shader->has_bssrdf_bump = false;
+		shader->has_bump = has_bump;
+		shader->has_bssrdf_bump = has_bump;
 		shader->has_volume = false;
 		shader->has_displacement = false;
 		shader->has_surface_spatial_varying = false;
@@ -1123,8 +1123,8 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
 		if(shader->used && graph && output->input("Surface")->link) {
 			shader->osl_surface_ref = compile_type(shader, shader->graph, SHADER_TYPE_SURFACE);
 
-			if(shader->graph_bump && shader->displacement_method != DISPLACE_TRUE)
-				shader->osl_surface_bump_ref = compile_type(shader, shader->graph_bump, SHADER_TYPE_BUMP);
+			if(has_bump)
+				shader->osl_surface_bump_ref = compile_type(shader, shader->graph, SHADER_TYPE_BUMP);
 			else
 				shader->osl_surface_bump_ref = OSL::ShaderGroupRef();
 
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 4db20338744..c59a5d97df5 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -148,8 +148,6 @@ void Scene::device_update(Device *device_, Progress& progress)
 	 * - Film needs light manager to run for use_light_visibility
 	 * - Lookup tables are done a second time to handle film tables
 	 */
-	
-	image_manager->set_pack_images(device->info.pack_images);
 
 	progress.set_status("Updating Shaders");
 	shader_manager->device_update(device, &dscene, this, progress);
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 4c2c4f5fcc3..0194327f567 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -121,13 +121,6 @@ public:
 	vector<device_vector<uchar>* > tex_byte_image;
 	vector<device_vector<half>* > tex_half_image;
 
-	/* opencl images */
-	device_vector<float4> tex_image_float4_packed;
-	device_vector<uchar4> tex_image_byte4_packed;
-	device_vector<float> tex_image_float_packed;
-	device_vector<uchar> tex_image_byte_packed;
-	device_vector<uint4> tex_image_packed_info;
-
 	KernelData data;
 };
 
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index ae462a1084a..f68efe38add 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -46,7 +46,7 @@ Session::Session(const SessionParams& params_)
 : params(params_),
   tile_manager(params.progressive, params.samples, params.tile_size, params.start_resolution,
        params.background == false || params.progressive_refine, params.background, params.tile_order,
-       max(params.device.multi_devices.size(), 1)),
+       max(params.device.multi_devices.size(), 1), params.pixel_size),
   stats()
 {
 	device_use_gl = ((params.device.type != DEVICE_CPU) && !params.background);
@@ -721,7 +721,6 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 	BakeManager *bake_manager = scene->bake_manager;
 	requested_features.use_baking = bake_manager->get_baking();
 	requested_features.use_integrator_branched = (scene->integrator->method == Integrator::BRANCHED_PATH);
-	requested_features.use_transparent &= scene->integrator->transparent_shadows;
 	requested_features.use_denoising = params.use_denoising;
 
 	return requested_features;
@@ -931,7 +930,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 		const bool rendering_finished = (tile == num_tiles);
 		const bool is_last_tile = (tile + 1) == num_tiles;
 
-		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
+		substatus = string_printf("Rendered %d/%d Tiles", tile, num_tiles);
 
 		if(!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) {
 			/* Some devices automatically support showing the sample number:
@@ -961,6 +960,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	}
 	else if(show_done) {
 		status = "Done";
+		progress.set_end_time(); /* Save end time so that further calls to get_time are accurate. */
 	}
 	else {
 		status = substatus;
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 9f8bb8c42fa..980eda0876d 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -53,6 +53,7 @@ public:
 	int2 tile_size;
 	TileOrder tile_order;
 	int start_resolution;
+	int pixel_size;
 	int threads;
 
 	bool display_buffer_linear;
@@ -81,6 +82,7 @@ public:
 		samples = INT_MAX;
 		tile_size = make_int2(64, 64);
 		start_resolution = INT_MAX;
+		pixel_size = 1;
 		threads = 0;
 
 		use_denoising = false;
@@ -110,6 +112,7 @@ public:
 		&& experimental == params.experimental
 		&& tile_size == params.tile_size
 		&& start_resolution == params.start_resolution
+		&& pixel_size == params.pixel_size
 		&& threads == params.threads
 		&& display_buffer_linear == params.display_buffer_linear
 		&& cancel_timeout == params.cancel_timeout
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 50400edd5ca..86378dfb495 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -177,7 +177,6 @@ Shader::Shader()
 	pass_id = 0;
 
 	graph = NULL;
-	graph_bump = NULL;
 
 	has_surface = false;
 	has_surface_transparent = false;
@@ -185,11 +184,13 @@ Shader::Shader()
 	has_surface_bssrdf = false;
 	has_volume = false;
 	has_displacement = false;
+	has_bump = false;
 	has_bssrdf_bump = false;
 	has_surface_spatial_varying = false;
 	has_volume_spatial_varying = false;
 	has_object_dependency = false;
 	has_integrator_dependency = false;
+	has_volume_connected = false;
 
 	displacement_method = DISPLACE_BUMP;
 
@@ -203,7 +204,6 @@ Shader::Shader()
 Shader::~Shader()
 {
 	delete graph;
-	delete graph_bump;
 }
 
 bool Shader::is_constant_emission(float3 *emission)
@@ -238,9 +238,7 @@ void Shader::set_graph(ShaderGraph *graph_)
 
 	/* assign graph */
 	delete graph;
-	delete graph_bump;
 	graph = graph_;
-	graph_bump = NULL;
 
 	/* Store info here before graph optimization to make sure that
 	 * nodes that get optimized away still count. */
@@ -457,15 +455,11 @@ void ShaderManager::device_update_common(Device *device,
 			flag |= SD_VOLUME_MIS;
 		if(shader->volume_interpolation_method == VOLUME_INTERPOLATION_CUBIC)
 			flag |= SD_VOLUME_CUBIC;
-		if(shader->graph_bump)
+		if(shader->has_bump)
 			flag |= SD_HAS_BUMP;
 		if(shader->displacement_method != DISPLACE_BUMP)
 			flag |= SD_HAS_DISPLACEMENT;
 
-		/* shader with bump mapping */
-		if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump)
-			flag |= SD_HAS_BSSRDF_BUMP;
-
 		/* constant emission check */
 		float3 constant_emission = make_float3(0.0f, 0.0f, 0.0f);
 		if(shader->is_constant_emission(&constant_emission))
@@ -502,9 +496,7 @@ void ShaderManager::device_update_common(Device *device,
 	KernelIntegrator *kintegrator = &dscene->data.integrator;
 	kintegrator->use_volumes = has_volumes;
 	/* TODO(sergey): De-duplicate with flags set in integrator.cpp. */
-	if(scene->integrator->transparent_shadows) {
-		kintegrator->transparent_shadows = has_transparent_shadow;
-	}
+	kintegrator->transparent_shadows = has_transparent_shadow;
 }
 
 void ShaderManager::device_free_common(Device *device, DeviceScene *dscene, Scene *scene)
@@ -609,11 +601,6 @@ void ShaderManager::get_requested_features(Scene *scene,
 		Shader *shader = scene->shaders[i];
 		/* Gather requested features from all the nodes from the graph nodes. */
 		get_requested_graph_features(shader->graph, requested_features);
-		/* Gather requested features from the graph itself. */
-		if(shader->graph_bump) {
-			get_requested_graph_features(shader->graph_bump,
-			                             requested_features);
-		}
 		ShaderNode *output_node = shader->graph->output();
 		if(output_node->input("Displacement")->link != NULL) {
 			requested_features->nodes_features |= NODE_FEATURE_BUMP;
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index b6714b13247..79a67d6756a 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -89,11 +89,6 @@ public:
 	/* shader graph */
 	ShaderGraph *graph;
 
-	/* shader graph with auto bump mapping included, we compile two shaders,
-	 * with and without bump,  because the displacement method is a mesh
-	 * level setting, so we need to handle both */
-	ShaderGraph *graph_bump;
-
 	/* sampling */
 	bool use_mis;
 	bool use_transparent_shadow;
@@ -121,6 +116,7 @@ public:
 	bool has_volume;
 	bool has_displacement;
 	bool has_surface_bssrdf;
+	bool has_bump;
 	bool has_bssrdf_bump;
 	bool has_surface_spatial_varying;
 	bool has_volume_spatial_varying;
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 48287d872d4..32f89897970 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -521,6 +521,9 @@ void SVMCompiler::generate_closure_node(ShaderNode *node,
 			if(node->has_bssrdf_bump())
 				current_shader->has_bssrdf_bump = true;
 		}
+		if(node->has_bump()) {
+			current_shader->has_bump = true;
+		}
 	}
 }
 
@@ -799,29 +802,21 @@ void SVMCompiler::compile(Scene *scene,
                           Summary *summary)
 {
 	/* copy graph for shader with bump mapping */
-	ShaderNode *node = shader->graph->output();
+	ShaderNode *output = shader->graph->output();
 	int start_num_svm_nodes = svm_nodes.size();
 
 	const double time_start = time_dt();
 
-	if(node->input("Surface")->link && node->input("Displacement")->link)
-		if(!shader->graph_bump)
-			shader->graph_bump = shader->graph->copy();
+	bool has_bump = (shader->displacement_method != DISPLACE_TRUE) &&
+	                output->input("Surface")->link && output->input("Displacement")->link;
 
 	/* finalize */
 	{
 		scoped_timer timer((summary != NULL)? &summary->time_finalize: NULL);
 		shader->graph->finalize(scene,
-		                        false,
-		                        shader->has_integrator_dependency);
-	}
-
-	if(shader->graph_bump) {
-		scoped_timer timer((summary != NULL)? &summary->time_finalize_bump: NULL);
-		shader->graph_bump->finalize(scene,
-		                             true,
-		                             shader->has_integrator_dependency,
-		                             shader->displacement_method == DISPLACE_BOTH);
+		                        has_bump,
+		                        shader->has_integrator_dependency,
+		                        shader->displacement_method == DISPLACE_BOTH);
 	}
 
 	current_shader = shader;
@@ -830,7 +825,8 @@ void SVMCompiler::compile(Scene *scene,
 	shader->has_surface_emission = false;
 	shader->has_surface_transparent = false;
 	shader->has_surface_bssrdf = false;
-	shader->has_bssrdf_bump = false;
+	shader->has_bump = has_bump;
+	shader->has_bssrdf_bump = has_bump;
 	shader->has_volume = false;
 	shader->has_displacement = false;
 	shader->has_surface_spatial_varying = false;
@@ -839,9 +835,9 @@ void SVMCompiler::compile(Scene *scene,
 	shader->has_integrator_dependency = false;
 
 	/* generate bump shader */
-	if(shader->displacement_method != DISPLACE_TRUE && shader->graph_bump) {
+	if(has_bump) {
 		scoped_timer timer((summary != NULL)? &summary->time_generate_bump: NULL);
-		compile_type(shader, shader->graph_bump, SHADER_TYPE_BUMP);
+		compile_type(shader, shader->graph, SHADER_TYPE_BUMP);
 		svm_nodes[index].y = svm_nodes.size();
 		svm_nodes.insert(svm_nodes.end(),
 		                 current_svm_nodes.begin(),
@@ -853,7 +849,7 @@ void SVMCompiler::compile(Scene *scene,
 		scoped_timer timer((summary != NULL)? &summary->time_generate_surface: NULL);
 		compile_type(shader, shader->graph, SHADER_TYPE_SURFACE);
 		/* only set jump offset if there's no bump shader, as the bump shader will fall thru to this one if it exists */
-		if(shader->displacement_method == DISPLACE_TRUE || !shader->graph_bump) {
+		if(!has_bump) {
 			svm_nodes[index].y = svm_nodes.size();
 		}
 		svm_nodes.insert(svm_nodes.end(),
@@ -895,7 +891,6 @@ SVMCompiler::Summary::Summary()
 	: num_svm_nodes(0),
 	  peak_stack_usage(0),
 	  time_finalize(0.0),
-	  time_finalize_bump(0.0),
 	  time_generate_surface(0.0),
 	  time_generate_bump(0.0),
 	  time_generate_volume(0.0),
@@ -911,10 +906,7 @@ string SVMCompiler::Summary::full_report() const
 	report += string_printf("Peak stack usage:    %d\n", peak_stack_usage);
 
 	report += string_printf("Time (in seconds):\n");
-	report += string_printf("  Finalize:          %f\n", time_finalize);
-	report += string_printf("  Bump finalize:     %f\n", time_finalize_bump);
-	report += string_printf("Finalize:            %f\n", time_finalize +
-	                                                     time_finalize_bump);
+	report += string_printf("Finalize:            %f\n", time_finalize);
 	report += string_printf("  Surface:           %f\n", time_generate_surface);
 	report += string_printf("  Bump:              %f\n", time_generate_bump);
 	report += string_printf("  Volume:            %f\n", time_generate_volume);
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index abbd9e50610..98ef5fa05d8 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -74,9 +74,6 @@ public:
 		/* Time spent on surface graph finalization. */
 		double time_finalize;
 
-		/* Time spent on bump graph finalization. */
-		double time_finalize_bump;
-
 		/* Time spent on generating SVM nodes for surface shader. */
 		double time_generate_surface;
 
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 176a1f4f0f3..a9620f79fa0 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -88,12 +88,14 @@ enum SpiralDirection {
 }  /* namespace */
 
 TileManager::TileManager(bool progressive_, int num_samples_, int2 tile_size_, int start_resolution_,
-                         bool preserve_tile_device_, bool background_, TileOrder tile_order_, int num_devices_)
+                         bool preserve_tile_device_, bool background_, TileOrder tile_order_,
+                         int num_devices_, int pixel_size_)
 {
 	progressive = progressive_;
 	tile_size = tile_size_;
 	tile_order = tile_order_;
 	start_resolution = start_resolution_;
+	pixel_size = pixel_size_;
 	num_samples = num_samples_;
 	num_devices = num_devices_;
 	preserve_tile_device = preserve_tile_device_;
@@ -163,15 +165,17 @@ void TileManager::set_samples(int num_samples_)
 		uint64_t pixel_samples = 0;
 		/* While rendering in the viewport, the initial preview resolution is increased to the native resolution
 		 * before the actual rendering begins. Therefore, additional pixel samples will be rendered. */
-		int divider = get_divider(params.width, params.height, start_resolution) / 2;
-		while(divider > 1) {
+		int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size);
+		while(divider > pixel_size) {
 			int image_w = max(1, params.width/divider);
 			int image_h = max(1, params.height/divider);
 			pixel_samples += image_w * image_h;
 			divider >>= 1;
 		}
 
-		state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * params.width*params.height;
+		int image_w = max(1, params.width/divider);
+		int image_h = max(1, params.height/divider);
+		state.total_pixel_samples = pixel_samples + (uint64_t)get_num_effective_samples() * image_w*image_h;
 		if(schedule_denoising) {
 			state.total_pixel_samples += params.width*params.height;
 		}
@@ -471,7 +475,7 @@ bool TileManager::done()
 	int end_sample = (range_num_samples == -1)
 	                     ? num_samples
 	                     : range_start_sample + range_num_samples;
-	return (state.resolution_divider == 1) &&
+	return (state.resolution_divider == pixel_size) &&
 	       (state.sample+state.num_samples >= end_sample);
 }
 
@@ -480,9 +484,9 @@ bool TileManager::next()
 	if(done())
 		return false;
 
-	if(progressive && state.resolution_divider > 1) {
+	if(progressive && state.resolution_divider > pixel_size) {
 		state.sample = 0;
-		state.resolution_divider /= 2;
+		state.resolution_divider = max(state.resolution_divider/2, pixel_size);
 		state.num_samples = 1;
 		set_tiles();
 	}
@@ -496,7 +500,7 @@ bool TileManager::next()
 		else
 			state.num_samples = range_num_samples;
 
-		state.resolution_divider = 1;
+		state.resolution_divider = pixel_size;
 		set_tiles();
 	}
 
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index e39a8f0627a..4cd57b7b30c 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -88,7 +88,7 @@ public:
 	int num_samples;
 
 	TileManager(bool progressive, int num_samples, int2 tile_size, int start_resolution,
-	            bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1);
+	            bool preserve_tile_device, bool background, TileOrder tile_order, int num_devices = 1, int pixel_size = 1);
 	~TileManager();
 
 	void free_device();
@@ -122,6 +122,7 @@ protected:
 	int2 tile_size;
 	TileOrder tile_order;
 	int start_resolution;
+	int pixel_size;
 	int num_devices;
 
 	/* in some cases it is important that the same tile will be returned for the same
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index 43f9a57d099..7f3747a0f58 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -38,6 +38,7 @@ set(SRC_HEADERS
 	util_atomic.h
 	util_boundbox.h
 	util_debug.h
+	util_defines.h
 	util_guarded_allocator.cpp
 	util_foreach.h
 	util_function.h
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 643af87a65f..f3c7ae546a0 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -22,16 +22,6 @@
 /* Using atomic ops header from Blender. */
 #include "atomic_ops.h"
 
-ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
-{
-	size_t prev_value = *maximum_value;
-	while(prev_value < value) {
-		if(atomic_cas_z(maximum_value, prev_value, value) != prev_value) {
-			break;
-		}
-	}
-}
-
 #define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
 
 #define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 10895f2e918..eb078d69252 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -122,13 +122,16 @@ void DebugFlags::OpenCL::reset()
 }
 
 DebugFlags::DebugFlags()
+: viewport_static_bvh(false)
 {
 	/* Nothing for now. */
 }
 
 void DebugFlags::reset()
 {
+	viewport_static_bvh = false;
 	cpu.reset();
+	cuda.reset();
 	opencl.reset();
 }
 
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 450cd900a9f..9255279c5ab 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -30,6 +30,9 @@ CCL_NAMESPACE_BEGIN
  */
 class DebugFlags {
 public:
+	/* Use static BVH in viewport, to match final render exactly. */
+	bool viewport_static_bvh;
+
 	/* Descriptor of CPU feature-set to be used. */
 	struct CPU {
 		CPU();
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
new file mode 100644
index 00000000000..ae654092c87
--- /dev/null
+++ b/intern/cycles/util/util_defines.h
@@ -0,0 +1,135 @@
+
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_DEFINES_H__
+#define __UTIL_DEFINES_H__
+
+/* Bitness */
+
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
+#  define __KERNEL_64_BIT__
+#endif
+
+/* Qualifiers for kernel code shared by CPU and GPU */
+
+#ifndef __KERNEL_GPU__
+#  define ccl_device static inline
+#  define ccl_device_noinline static
+#  define ccl_global
+#  define ccl_constant
+#  define ccl_local
+#  define ccl_local_param
+#  define ccl_private
+#  define ccl_restrict __restrict
+#  define ccl_ref &
+#  define __KERNEL_WITH_SSE_ALIGN__
+
+#  if defined(_WIN32) && !defined(FREE_WINDOWS)
+#    define ccl_device_inline static __forceinline
+#    define ccl_device_forceinline static __forceinline
+#    define ccl_align(...) __declspec(align(__VA_ARGS__))
+#    ifdef __KERNEL_64_BIT__
+#      define ccl_try_align(...) __declspec(align(__VA_ARGS__))
+#    else  /* __KERNEL_64_BIT__ */
+#      undef __KERNEL_WITH_SSE_ALIGN__
+/* No support for function arguments (error C2719). */
+#      define ccl_try_align(...)
+#    endif  /* __KERNEL_64_BIT__ */
+#    define ccl_may_alias
+#    define ccl_always_inline __forceinline
+#    define ccl_never_inline __declspec(noinline)
+#    define ccl_maybe_unused
+#  else  /* _WIN32 && !FREE_WINDOWS */
+#    define ccl_device_inline static inline __attribute__((always_inline))
+#    define ccl_device_forceinline static inline __attribute__((always_inline))
+#    define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
+#    ifndef FREE_WINDOWS64
+#      define __forceinline inline __attribute__((always_inline))
+#    endif
+#    define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
+#    define ccl_may_alias __attribute__((__may_alias__))
+#    define ccl_always_inline __attribute__((always_inline))
+#    define ccl_never_inline __attribute__((noinline))
+#    define ccl_maybe_unused __attribute__((unused))
+#  endif  /* _WIN32 && !FREE_WINDOWS */
+
+/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
+#  if defined(__GNUC__) && (__GNUC__ >= 7)  /* gcc7.0+ only */
+#    define ATTR_FALLTHROUGH __attribute__((fallthrough))
+#  else
+#    define ATTR_FALLTHROUGH ((void)0)
+#  endif
+#endif  /* __KERNEL_GPU__ */
+
+/* macros */
+
+/* hints for branch prediction, only use in code that runs a _lot_ */
+#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#  define LIKELY(x)       __builtin_expect(!!(x), 1)
+#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
+#else
+#  define LIKELY(x)       (x)
+#  define UNLIKELY(x)     (x)
+#endif
+
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
+#  define HAS_CPP11_FEATURES
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(HAS_CPP11_FEATURES)
+/* Some magic to be sure we don't have reference in the type. */
+template<typename T> static inline T decltype_helper(T x) { return x; }
+#    define TYPEOF(x) decltype(decltype_helper(x))
+#  else
+#    define TYPEOF(x) typeof(x)
+#  endif
+#endif
+
+/* Causes warning:
+ * incompatible types when assigning to type 'Foo' from type 'Bar'
+ * ... the compiler optimizes away the temp var */
+#ifdef __GNUC__
+#define CHECK_TYPE(var, type)  {  \
+	TYPEOF(var) *__tmp;           \
+	__tmp = (type *)NULL;         \
+	(void)__tmp;                  \
+} (void)0
+
+#define CHECK_TYPE_PAIR(var_a, var_b)  {  \
+	TYPEOF(var_a) *__tmp;                 \
+	__tmp = (TYPEOF(var_b) *)NULL;        \
+	(void)__tmp;                          \
+} (void)0
+#else
+#  define CHECK_TYPE(var, type)
+#  define CHECK_TYPE_PAIR(var_a, var_b)
+#endif
+
+/* can be used in simple macros */
+#define CHECK_TYPE_INLINE(val, type) \
+	((void)(((type)0) != (val)))
+
+#ifndef __KERNEL_GPU__
+#  include <cassert>
+#  define util_assert(statement)  assert(statement)
+#else
+#  define util_assert(statement)
+#endif
+
+#endif /* __UTIL_DEFINES_H__ */
+
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index b719640b19c..fb04d49bcd9 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -94,6 +94,7 @@ ccl_device_inline float fminf(float a, float b)
 #ifndef __KERNEL_GPU__
 using std::isfinite;
 using std::isnan;
+using std::sqrt;
 
 ccl_device_inline int abs(int x)
 {
@@ -223,7 +224,7 @@ ccl_device_inline bool isfinite_safe(float f)
 {
 	/* By IEEE 754 rule, 2*Inf equals Inf */
 	unsigned int x = __float_as_uint(f);
-	return (f == f) && (x == 0 || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
+	return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
 }
 
 ccl_device_inline float ensure_finite(float v)
@@ -329,15 +330,22 @@ template<class A, class B> A lerp(const A& a, const A& b, const B& t)
 	return (A)(a * ((B)1 - t) + b * t);
 }
 
+#endif  /* __KERNEL_OPENCL__ */
+
 /* Triangle */
 
+#ifndef __KERNEL_OPENCL__
 ccl_device_inline float triangle_area(const float3& v1,
                                       const float3& v2,
                                       const float3& v3)
+#else
+ccl_device_inline float triangle_area(const float3 v1,
+                                      const float3 v2,
+                                      const float3 v3)
+#endif
 {
 	return len(cross(v3 - v2, v1 - v2))*0.5f;
 }
-#endif  /* __KERNEL_OPENCL__ */
 
 /* Orthonormal vectors */
 
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index bb04c4aa2d9..e73e5bc17a2 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -108,8 +108,7 @@ ccl_device_inline float3 operator*(const float3& a, const float f)
 
 ccl_device_inline float3 operator*(const float f, const float3& a)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
+#if defined(__KERNEL_SSE__)
 	return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
 #else
 	return make_float3(a.x*f, a.y*f, a.z*f);
@@ -118,10 +117,8 @@ ccl_device_inline float3 operator*(const float f, const float3& a)
 
 ccl_device_inline float3 operator/(const float f, const float3& a)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(a.m128);
-	return float3(_mm_mul_ps(_mm_set1_ps(f),rc));
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
 #else
 	return make_float3(f / a.x, f / a.y, f / a.z);
 #endif
@@ -135,10 +132,8 @@ ccl_device_inline float3 operator/(const float3& a, const float f)
 
 ccl_device_inline float3 operator/(const float3& a, const float3& b)
 {
-	/* TODO(sergey): Currently disabled, gives speedup but causes precision issues. */
-#if defined(__KERNEL_SSE__) && 0
-	__m128 rc = _mm_rcp_ps(b.m128);
-	return float3(_mm_mul_ps(a, rc));
+#if defined(__KERNEL_SSE__)
+	return float3(_mm_div_ps(a.m128, b.m128));
 #else
 	return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
 #endif
@@ -282,9 +277,8 @@ ccl_device_inline float3 mix(const float3& a, const float3& b, float t)
 ccl_device_inline float3 rcp(const float3& a)
 {
 #ifdef __KERNEL_SSE__
-	const float4 r(_mm_rcp_ps(a.m128));
-	return float3(_mm_sub_ps(_mm_add_ps(r, r),
-	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
 	return make_float3(1.0f/a.x, 1.0f/a.y, 1.0f/a.z);
 #endif
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index d89121b3a1d..aa7e56fefe9 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -48,23 +48,30 @@ ccl_device_inline bool operator==(const float4& a, const float4& b);
 ccl_device_inline float dot(const float4& a, const float4& b);
 ccl_device_inline float len_squared(const float4& a);
 ccl_device_inline float4 rcp(const float4& a);
+ccl_device_inline float4 sqrt(const float4& a);
+ccl_device_inline float4 sqr(const float4& a);
 ccl_device_inline float4 cross(const float4& a, const float4& b);
 ccl_device_inline bool is_zero(const float4& a);
-ccl_device_inline float reduce_add(const float4& a);
 ccl_device_inline float average(const float4& a);
 ccl_device_inline float len(const float4& a);
 ccl_device_inline float4 normalize(const float4& a);
 ccl_device_inline float4 safe_normalize(const float4& a);
 ccl_device_inline float4 min(const float4& a, const float4& b);
 ccl_device_inline float4 max(const float4& a, const float4& b);
+ccl_device_inline float4 fabs(const float4& a);
 #endif  /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
 template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
 __forceinline const float4 shuffle(const float4& b);
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b);
 
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b);
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b);
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b);
+
 #  ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b);
 template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b);
@@ -77,9 +84,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b);
 ccl_device_inline float4 reduce_min(const float4& a);
 ccl_device_inline float4 reduce_max(const float4& a);
-#  if 0
 ccl_device_inline float4 reduce_add(const float4& a);
-#  endif
 #endif  /* !__KERNEL_GPU__ */
 
 /*******************************************************************************
@@ -128,7 +133,7 @@ ccl_device_inline float4 operator/(const float4& a, float f)
 ccl_device_inline float4 operator/(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	return a * rcp(b);
+	return float4(_mm_div_ps(a.m128, b.m128));
 #else
 	return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
 #endif
@@ -171,8 +176,7 @@ ccl_device_inline float4 operator/=(float4& a, float f)
 ccl_device_inline int4 operator<(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	/* TODO(sergey): avoid cvt. */
-	return int4(_mm_cvtps_epi32(_mm_cmplt_ps(a.m128, b.m128)));
+	return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
 #else
 	return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
 #endif
@@ -181,8 +185,7 @@ ccl_device_inline int4 operator<(const float4& a, const float4& b)
 ccl_device_inline int4 operator>=(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	/* TODO(sergey): avoid cvt. */
-	return int4(_mm_cvtps_epi32(_mm_cmpge_ps(a.m128, b.m128)));
+	return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
 #else
 	return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
 #endif
@@ -191,8 +194,7 @@ ccl_device_inline int4 operator>=(const float4& a, const float4& b)
 ccl_device_inline int4 operator<=(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	/* TODO(sergey): avoid cvt. */
-	return int4(_mm_cvtps_epi32(_mm_cmple_ps(a.m128, b.m128)));
+	return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
 #else
 	return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
 #endif
@@ -224,14 +226,30 @@ ccl_device_inline float len_squared(const float4& a)
 ccl_device_inline float4 rcp(const float4& a)
 {
 #ifdef __KERNEL_SSE__
-	float4 r(_mm_rcp_ps(a.m128));
-	return float4(_mm_sub_ps(_mm_add_ps(r, r),
-	                         _mm_mul_ps(_mm_mul_ps(r, r), a)));
+	/* Don't use _mm_rcp_ps due to poor precision. */
+	return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
 #else
 	return make_float4(1.0f/a.x, 1.0f/a.y, 1.0f/a.z, 1.0f/a.w);
 #endif
 }
 
+ccl_device_inline float4 sqrt(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_sqrt_ps(a.m128));
+#else
+	return make_float4(sqrtf(a.x),
+	                   sqrtf(a.y),
+	                   sqrtf(a.z),
+	                   sqrtf(a.w));
+#endif
+}
+
+ccl_device_inline float4 sqr(const float4& a)
+{
+	return a * a;
+}
+
 ccl_device_inline float4 cross(const float4& a, const float4& b)
 {
 #ifdef __KERNEL_SSE__
@@ -254,20 +272,25 @@ ccl_device_inline bool is_zero(const float4& a)
 #endif
 }
 
-ccl_device_inline float reduce_add(const float4& a)
+ccl_device_inline float4 reduce_add(const float4& a)
 {
 #ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_SSE3__
+	float4 h(_mm_hadd_ps(a.m128, a.m128));
+	return float4(_mm_hadd_ps(h.m128, h.m128));
+#  else
 	float4 h(shuffle<1,0,3,2>(a) + a);
-	/* TODO(sergey): Investigate efficiency. */
-	return _mm_cvtss_f32(shuffle<2,3,0,1>(h) + h);
+	return shuffle<2,3,0,1>(h) + h;
+#  endif
 #else
-	return ((a.x + a.y) + (a.z + a.w));
+	float sum = (a.x + a.y) + (a.z + a.w);
+	return make_float4(sum, sum, sum, sum);
 #endif
 }
 
 ccl_device_inline float average(const float4& a)
 {
-	return reduce_add(a) * 0.25f;
+	return reduce_add(a).x * 0.25f;
 }
 
 ccl_device_inline float len(const float4& a)
@@ -309,6 +332,18 @@ ccl_device_inline float4 max(const float4& a, const float4& b)
 	                   max(a.w, b.w));
 #endif
 }
+
+ccl_device_inline float4 fabs(const float4& a)
+{
+#ifdef __KERNEL_SSE__
+	return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
+#else
+	return make_float4(fabsf(a.x),
+	                   fabsf(a.y),
+	                   fabsf(a.z),
+	                   fabsf(a.w));
+#endif
+}
 #endif  /* !__KERNEL_OPENCL__*/
 
 #ifdef __KERNEL_SSE__
@@ -320,11 +355,28 @@ __forceinline const float4 shuffle(const float4& b)
 	                          _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
 }
 
+template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
+__forceinline const float4 shuffle(const float4& a, const float4& b)
+{
+	return float4(_mm_shuffle_ps(a.m128, b.m128,
+	                             _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
+}
+
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
 {
 	return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
 }
 
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& a, const float4& b)
+{
+	return float4(_mm_movelh_ps(a.m128, b.m128));
+}
+
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4& a, const float4& b)
+{
+	return float4(_mm_movehl_ps(b.m128, a.m128));
+}
+
 #  ifdef __KERNEL_SSE3__
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
 {
@@ -344,9 +396,7 @@ ccl_device_inline float4 select(const int4& mask,
                                 const float4& b)
 {
 #ifdef __KERNEL_SSE__
-	/* TODO(sergey): avoid cvt. */
-	return float4(_mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(mask), a),
-	                        _mm_andnot_ps(_mm_cvtepi32_ps(mask), b)));
+	return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
 #else
 	return make_float4((mask.x)? a.x: b.x,
 	                   (mask.y)? a.y: b.y,
@@ -355,6 +405,13 @@ ccl_device_inline float4 select(const int4& mask,
 #endif
 }
 
+ccl_device_inline float4 mask(const int4& mask,
+                              const float4& a)
+{
+	/* Replace elements of a with zero where mask isn't set. */
+	return select(mask, a, make_float4(0.0f));
+}
+
 ccl_device_inline float4 reduce_min(const float4& a)
 {
 #ifdef __KERNEL_SSE__
@@ -375,17 +432,15 @@ ccl_device_inline float4 reduce_max(const float4& a)
 #endif
 }
 
-#if 0
-ccl_device_inline float4 reduce_add(const float4& a)
+ccl_device_inline float4 load_float4(const float *v)
 {
 #ifdef __KERNEL_SSE__
-	float4 h = shuffle<1,0,3,2>(a) + a;
-	return shuffle<2,3,0,1>(h) + h;
+	return float4(_mm_loadu_ps(v));
 #else
-	return make_float4((a.x + a.y) + (a.z + a.w));
+	return make_float4(v[0], v[1], v[2], v[3]);
 #endif
 }
-#endif
+
 #endif  /* !__KERNEL_GPU__ */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index c7511f8306e..b31dbe4fc67 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -223,20 +223,20 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 {
 	const float singular_epsilon = 1e-9f;
 
-	for (int row = 0; row < n; row++) {
-		for (int col = 0; col < n; col++) {
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col < n; col++) {
 			MATS(V, n, row, col, v_stride) = (col == row) ? 1.0f : 0.0f;
 		}
 	}
 
-	for (int sweep = 0; sweep < 8; sweep++) {
+	for(int sweep = 0; sweep < 8; sweep++) {
 		float off_diagonal = 0.0f;
-		for (int row = 1; row < n; row++) {
-			for (int col = 0; col < row; col++) {
+		for(int row = 1; row < n; row++) {
+			for(int col = 0; col < row; col++) {
 				off_diagonal += fabsf(MAT(A, n, row, col));
 			}
 		}
-		if (off_diagonal < 1e-7f) {
+		if(off_diagonal < 1e-7f) {
 			/* The matrix has nearly reached diagonal form.
 			 * Since the eigenvalues are only used to determine truncation, their exact values aren't required - a relative error of a few ULPs won't matter at all. */
 			break;
@@ -253,7 +253,7 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 				float abs_element = fabsf(element);
 
 				/* If we're in a later sweep and the element already is very small, just set it to zero and skip the rotation. */
-				if (sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
+				if(sweep > 3 && abs_element <= singular_epsilon*fabsf(MAT(A, n, row, row)) && abs_element <= singular_epsilon*fabsf(MAT(A, n, col, col))) {
 					MAT(A, n, row, col) = 0.0f;
 					continue;
 				}
@@ -272,10 +272,10 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 				 * Then, we compute sin(phi) and cos(phi) themselves. */
 				float singular_diff = MAT(A, n, row, row) - MAT(A, n, col, col);
 				float ratio;
-				if (abs_element > singular_epsilon*fabsf(singular_diff)) {
+				if(abs_element > singular_epsilon*fabsf(singular_diff)) {
 					float cot_2phi = 0.5f*singular_diff / element;
 					ratio = 1.0f / (fabsf(cot_2phi) + sqrtf(1.0f + cot_2phi*cot_2phi));
-					if (cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
+					if(cot_2phi < 0.0f) ratio = -ratio; /* Copy sign. */
 				}
 				else {
 					ratio = element / singular_diff;
@@ -315,21 +315,21 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 	}
 
 	/* Sort eigenvalues and the associated eigenvectors. */
-	for (int i = 0; i < n - 1; i++) {
+	for(int i = 0; i < n - 1; i++) {
 		float v = MAT(A, n, i, i);
 		int k = i;
-		for (int j = i; j < n; j++) {
-			if (MAT(A, n, j, j) >= v) {
+		for(int j = i; j < n; j++) {
+			if(MAT(A, n, j, j) >= v) {
 				v = MAT(A, n, j, j);
 				k = j;
 			}
 		}
-		if (k != i) {
+		if(k != i) {
 			/* Swap eigenvalues. */
 			MAT(A, n, k, k) = MAT(A, n, i, i);
 			MAT(A, n, i, i) = v;
 			/* Swap eigenvectors. */
-			for (int j = 0; j < n; j++) {
+			for(int j = 0; j < n; j++) {
 				float v = MATS(V, n, i, j, v_stride);
 				MATS(V, n, i, j, v_stride) = MATS(V, n, k, j, v_stride);
 				MATS(V, n, k, j, v_stride) = v;
@@ -339,59 +339,59 @@ ccl_device void math_matrix_jacobi_eigendecomposition(float *A, ccl_global float
 }
 
 #ifdef __KERNEL_SSE3__
-ccl_device_inline void math_vector_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_vector_zero_sse(float4 *A, int n)
 {
 	for(int i = 0; i < n; i++) {
-		A[i] = _mm_setzero_ps();
+		A[i] = make_float4(0.0f);
 	}
 }
 
-ccl_device_inline void math_matrix_zero_sse(__m128 *A, int n)
+ccl_device_inline void math_matrix_zero_sse(float4 *A, int n)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_setzero_ps();
+			MAT(A, n, row, col) = make_float4(0.0f);
 		}
 	}
 }
 
 /* Add Gramian matrix of v to A.
  * The Gramian matrix of v is v^T*v, so element (i,j) is v[i]*v[j]. */
-ccl_device_inline void math_matrix_add_gramian_sse(__m128 *A, int n, const __m128 *ccl_restrict v, __m128 weight)
+ccl_device_inline void math_matrix_add_gramian_sse(float4 *A, int n, const float4 *ccl_restrict v, float4 weight)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_add_ps(MAT(A, n, row, col), _mm_mul_ps(_mm_mul_ps(v[row], v[col]), weight));
+			MAT(A, n, row, col) = MAT(A, n, row, col) + v[row] * v[col] * weight;
 		}
 	}
 }
 
-ccl_device_inline void math_vector_add_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_add_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
 	for(int i = 0; i < n; i++) {
-		V[i] = _mm_add_ps(V[i], a[i]);
+		V[i] += a[i];
 	}
 }
 
-ccl_device_inline void math_vector_mul_sse(__m128 *V, int n, const __m128 *ccl_restrict a)
+ccl_device_inline void math_vector_mul_sse(float4 *V, int n, const float4 *ccl_restrict a)
 {
 	for(int i = 0; i < n; i++) {
-		V[i] = _mm_mul_ps(V[i], a[i]);
+		V[i] *= a[i];
 	}
 }
 
-ccl_device_inline void math_vector_max_sse(__m128 *a, const __m128 *ccl_restrict b, int n)
+ccl_device_inline void math_vector_max_sse(float4 *a, const float4 *ccl_restrict b, int n)
 {
 	for(int i = 0; i < n; i++) {
-		a[i] = _mm_max_ps(a[i], b[i]);
+		a[i] = max(a[i], b[i]);
 	}
 }
 
-ccl_device_inline void math_matrix_hsum(float *A, int n, const __m128 *ccl_restrict B)
+ccl_device_inline void math_matrix_hsum(float *A, int n, const float4 *ccl_restrict B)
 {
 	for(int row = 0; row < n; row++) {
 		for(int col = 0; col <= row; col++) {
-			MAT(A, n, row, col) = _mm_hsum_ss(MAT(B, n, row, col));
+			MAT(A, n, row, col) = reduce_add(MAT(B, n, row, col))[0];
 		}
 	}
 }
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 6f70a474fe7..3c5785c4807 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -19,22 +19,15 @@
 
 #ifndef __KERNEL_GPU__
 
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)  || \
-	defined(__KERNEL_SSE3__)  || \
-	defined(__KERNEL_SSSE3__) || \
-	defined(__KERNEL_SSE41__) || \
-	defined(__KERNEL_AVX__)   || \
-	defined(__KERNEL_AVX2__)
-	/* do nothing */
-#endif
-
 /* x86
  *
  * Compile a regular, SSE2 and SSE3 kernel. */
 
 #if defined(i386) || defined(_M_IX86)
 
+/* We require minimum SSE2 support on x86, so auto enable. */
+#  define __KERNEL_SSE2__
+
 #  ifdef WITH_KERNEL_SSE2
 #    define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 #  endif
@@ -73,48 +66,6 @@
 
 #endif  /* defined(__x86_64__) || defined(_M_X64) */
 
-/* SSE Experiment
- *
- * This is disabled code for an experiment to use SSE types globally for types
- * such as float3 and float4. Currently this gives an overall slowdown. */
-
-#if 0
-#  define __KERNEL_SSE__
-#  ifndef __KERNEL_SSE2__
-#    define __KERNEL_SSE2__
-#  endif
-#  ifndef __KERNEL_SSE3__
-#    define __KERNEL_SSE3__
-#  endif
-#  ifndef __KERNEL_SSSE3__
-#    define __KERNEL_SSSE3__
-#  endif
-#  ifndef __KERNEL_SSE4__
-#    define __KERNEL_SSE4__
-#  endif
-#endif
-
-/* SSE Intrinsics includes
- *
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
-
-/* SSE intrinsics headers */
-#ifndef FREE_WINDOWS64
-
-#ifdef _MSC_VER
-#  include <intrin.h>
-#elif (defined(__x86_64__) || defined(__i386__))
-#  include <x86intrin.h>
-#endif
-
-#else
-
-/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
- * Since we can't avoid including <windows.h>, better only include that */
-#include "util/util_windows.h"
-
-#endif
-
 #endif
 
 #endif /* __UTIL_OPTIMIZATION_H__ */
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index f9c3b4bb139..bae5d5bd6d1 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -45,6 +45,7 @@ OIIO_NAMESPACE_USING
 #  include <shlwapi.h>
 #endif
 
+#include "util/util_map.h"
 #include "util/util_windows.h"
 
 CCL_NAMESPACE_BEGIN
@@ -768,68 +769,180 @@ bool path_remove(const string& path)
 	return remove(path.c_str()) == 0;
 }
 
-static string line_directive(const string& base, const string& path, int line)
+struct SourceReplaceState {
+	typedef map<string, string> ProcessedMapping;
+	/* Base directory for all relative include headers. */
+	string base;
+	/* Result of processed files. */
+	ProcessedMapping processed_files;
+	/* Set of files which are considered "precompiled" and which are replaced
+	 * with an empty string on a subsequent occurrence in an include statement.
+	 */
+	set<string> precompiled_headers;
+};
+
+static string path_source_replace_includes_recursive(
+        const string& source,
+        const string& source_filepath,
+        SourceReplaceState *state);
+
+static string line_directive(const SourceReplaceState& state,
+                             const string& path,
+                             const int line)
 {
-	string escaped_path = path;
+	string unescaped_path = path;
 	/* First we make path relative. */
-	if(string_startswith(escaped_path, base.c_str())) {
-		const string base_file = path_filename(base);
-		const size_t base_len = base.length();
-		escaped_path = base_file + escaped_path.substr(base_len,
-		                                               escaped_path.length() - base_len);
+	if(string_startswith(unescaped_path, state.base.c_str())) {
+		const string base_file = path_filename(state.base);
+		const size_t base_len = state.base.length();
+		unescaped_path = base_file +
+		        unescaped_path.substr(base_len,
+		                            unescaped_path.length() - base_len);
 	}
 	/* Second, we replace all unsafe characters. */
-	string_replace(escaped_path, "\"", "\\\"");
-	string_replace(escaped_path, "\'", "\\\'");
-	string_replace(escaped_path, "\?", "\\\?");
-	string_replace(escaped_path, "\\", "\\\\");
+	const size_t length = unescaped_path.length();
+	string escaped_path = "";
+	for(size_t i = 0; i < length; ++i) {
+		const char ch = unescaped_path[i];
+		if(strchr("\"\'\?\\", ch) != NULL) {
+			escaped_path += "\\";
+		}
+		escaped_path += ch;
+	}
+	/* TODO(sergey): Check whether using std::to_string combined with several
+	 * concatenation operations is any faster.
+	 */
 	return string_printf("#line %d \"%s\"", line, escaped_path.c_str());
 }
 
+static string path_source_handle_preprocessor(
+        const string& preprocessor_line,
+        const string& source_filepath,
+        const size_t line_number,
+        SourceReplaceState *state)
+{
+	string result = preprocessor_line;
+	string token = string_strip(
+	        preprocessor_line.substr(1, preprocessor_line.size() - 1));
+	if(string_startswith(token, "include")) {
+		token = string_strip(token.substr(7, token.size() - 7));
+		if(token[0] == '"') {
+			const size_t n_start = 1;
+			const size_t n_end = token.find("\"", n_start);
+			const string filename = token.substr(n_start, n_end - n_start);
+			const bool is_precompiled = string_endswith(token, "// PRECOMPILED");
+			string filepath = path_join(state->base, filename);
+			if(!path_exists(filepath)) {
+				filepath = path_join(path_dirname(source_filepath),
+				                     filename);
+			}
+			if(is_precompiled) {
+				state->precompiled_headers.insert(filepath);
+			}
+			string text;
+			if(path_read_text(filepath, text)) {
+				text = path_source_replace_includes_recursive(
+				        text, filepath, state);
+				/* Use line directives for better error messages. */
+				result = line_directive(*state, filepath, 1) + "\n"
+				     + text + "\n"
+				     + line_directive(*state, source_filepath, line_number + 1);
+			}
+		}
+	}
+	return result;
+}
+
+/* Our own little c preprocessor that replaces #includes with the file
+ * contents, to work around issue of OpenCL drivers not supporting
+ * include paths with spaces in them.
+ */
 static string path_source_replace_includes_recursive(
-        const string& base,
         const string& source,
-        const string& source_filepath)
+        const string& source_filepath,
+        SourceReplaceState *state)
 {
-	/* Our own little c preprocessor that replaces #includes with the file
-	 * contents, to work around issue of OpenCL drivers not supporting
-	 * include paths with spaces in them.
+	/* Try to re-use processed file without spending time on replacing all
+	 * include directives again.
 	 */
-
+	SourceReplaceState::ProcessedMapping::iterator replaced_file =
+	        state->processed_files.find(source_filepath);
+	if(replaced_file != state->processed_files.end()) {
+		if(state->precompiled_headers.find(source_filepath) !=
+		        state->precompiled_headers.end()) {
+			return "";
+		}
+		return replaced_file->second;
+	}
+	/* Perform full file processing. */
 	string result = "";
-	vector<string> lines;
-	string_split(lines, source, "\n", false);
-
-	for(size_t i = 0; i < lines.size(); ++i) {
-		string line = lines[i];
-		if(line[0] == '#') {
-			string token = string_strip(line.substr(1, line.size() - 1));
-			if(string_startswith(token, "include")) {
-				token = string_strip(token.substr(7, token.size() - 7));
-				if(token[0] == '"') {
-					const size_t n_start = 1;
-					const size_t n_end = token.find("\"", n_start);
-					const string filename = token.substr(n_start, n_end - n_start);
-					string filepath = path_join(base, filename);
-					if(!path_exists(filepath)) {
-						filepath = path_join(path_dirname(source_filepath),
-						                     filename);
-					}
-					string text;
-					if(path_read_text(filepath, text)) {
-						text = path_source_replace_includes_recursive(
-						        base, text, filepath);
-						/* Use line directives for better error messages. */
-						line = line_directive(base, filepath, 1)
-						     + token.replace(0, n_end + 1, "\n" + text + "\n")
-						     + line_directive(base, source_filepath, i + 1);
-					}
-				}
+	const size_t source_length = source.length();
+	size_t index = 0;
+	/* Information about where we are in the source. */
+	size_t line_number = 0, column_number = 1;
+	/* Currently gathered non-preprocessor token.
+	 * Store as start/length rather than token itself to avoid overhead of
+	 * memory re-allocations on each character concatenation.
+	 */
+	size_t token_start = 0, token_length = 0;
+	/* Denotes whether we're inside of preprocessor line, together with
+	 * preprocessor line itself.
+	 *
+	 * TODO(sergey): Investigate whether using token start/end position
+	 * gives measurable speedup.
+	 */
+	bool inside_preprocessor = false;
+	string preprocessor_line = "";
+	/* Actual loop over the whole source. */
+	while(index < source_length) {
+		const char ch = source[index];
+		if(ch == '\n') {
+			if(inside_preprocessor) {
+				result += path_source_handle_preprocessor(preprocessor_line,
+				                                          source_filepath,
+				                                          line_number,
+				                                          state);
+				/* Start gathering next part of the token. */
+				token_start = index;
+				token_length = 0;
+			}
+			inside_preprocessor = false;
+			preprocessor_line = "";
+			column_number = 0;
+			++line_number;
+		}
+		else if(ch == '#' && column_number == 1 && !inside_preprocessor) {
+			/* Append any pending non-preprocessor token to the result. */
+			if(token_length != 0) {
+				result.append(source, token_start, token_length);
+				token_start = index;
+				token_length = 0;
 			}
+			inside_preprocessor = true;
+		}
+		if(inside_preprocessor) {
+			preprocessor_line += ch;
+		}
+		else {
+			++token_length;
 		}
-		result += line + "\n";
+		++index;
+		++column_number;
 	}
-
+	/* Append possible tokens which happened before special events handled
+	 * above.
+	 */
+	if(token_length != 0) {
+		result.append(source, token_start, token_length);
+	}
+	if(inside_preprocessor) {
+		result += path_source_handle_preprocessor(preprocessor_line,
+		                                          source_filepath,
+		                                          line_number,
+		                                          state);
+	}
+	/* Store result for further reuse. */
+	state->processed_files[source_filepath] = result;
 	return result;
 }
 
@@ -837,10 +950,12 @@ string path_source_replace_includes(const string& source,
                                     const string& path,
                                     const string& source_filename)
 {
+	SourceReplaceState state;
+	state.base = path;
 	return path_source_replace_includes_recursive(
-	        path,
 	        source,
-	        path_join(path, source_filename));
+	        path_join(path, source_filename),
+	        &state);
 }
 
 FILE *path_fopen(const string& path, const string& mode)
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index cd4fe52fdc9..134383e88db 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -41,6 +41,7 @@ public:
 		denoised_tiles = 0;
 		start_time = time_dt();
 		render_start_time = time_dt();
+		end_time = 0.0;
 		status = "Initializing";
 		substatus = "";
 		sync_status = "";
@@ -80,6 +81,7 @@ public:
 		denoised_tiles = 0;
 		start_time = time_dt();
 		render_start_time = time_dt();
+		end_time = 0.0;
 		status = "Initializing";
 		substatus = "";
 		sync_status = "";
@@ -146,6 +148,7 @@ public:
 		thread_scoped_lock lock(progress_mutex);
 
 		start_time = time_dt();
+		end_time = 0.0;
 	}
 
 	void set_render_start_time()
@@ -169,8 +172,18 @@ public:
 	{
 		thread_scoped_lock lock(progress_mutex);
 
-		total_time_ = time_dt() - start_time;
-		render_time_ = time_dt() - render_start_time;
+		double time = (end_time > 0) ? end_time : time_dt();
+
+		total_time_ = time - start_time;
+		render_time_ = time - render_start_time;
+	}
+
+	void set_end_time()
+	{
+		/* Same locking as the other time setters; end_time is read under this mutex. */
+		thread_scoped_lock lock(progress_mutex);
+
+		end_time = time_dt();
 	}
 
 	void reset_sample()
@@ -337,6 +350,8 @@ protected:
 	int rendered_tiles, denoised_tiles;
 
 	double start_time, render_start_time;
+	/* End time written when render is done, so it doesn't keep increasing on redraws. */
+	double end_time;
 
 	string status;
 	string substatus;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 587febe3e52..58b3d267266 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -18,19 +18,38 @@
 #ifndef __UTIL_SIMD_TYPES_H__
 #define __UTIL_SIMD_TYPES_H__
 
+#ifndef __KERNEL_GPU__
+
 #include <limits>
 
 #include "util/util_debug.h"
-#include "util/util_types.h"
+#include "util/util_defines.h"
+
+/* SSE Intrinsics includes
+ *
+ * We assume __KERNEL_SSEX__ flags to have been defined at this point */
+
+/* SSE intrinsics headers */
+#ifndef FREE_WINDOWS64
+
+#ifdef _MSC_VER
+#  include <intrin.h>
+#elif (defined(__x86_64__) || defined(__i386__))
+#  include <x86intrin.h>
+#endif
+
+#else
+
+/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
+ * Since we can't avoid including <windows.h>, better only include that */
+#include "util/util_windows.h"
+
+#endif
 
 CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
-struct sseb;
-struct ssei;
-struct ssef;
-
 extern const __m128 _mm_lookupmask_ps[16];
 
 /* Special Types */
@@ -328,12 +347,12 @@ __forceinline size_t __bscf(size_t& v)
 
 #endif /* _WIN32 */
 
-static const unsigned int BITSCAN_NO_BIT_SET_32 = 32;
-static const size_t       BITSCAN_NO_BIT_SET_64 = 64;
+/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
+ * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
+ * platforms when compiling code outside the kernel. */
+#if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
 
-#ifdef __KERNEL_SSE3__
-/* Emulation of SSE4 functions with SSE3 */
-#  ifndef __KERNEL_SSE41__
+/* Emulation of SSE4 functions with SSE2 */
 
 #define _MM_FROUND_TO_NEAREST_INT    0x00
 #define _MM_FROUND_TO_NEG_INF        0x01
@@ -342,50 +361,50 @@ static const size_t       BITSCAN_NO_BIT_SET_64 = 64;
 #define _MM_FROUND_CUR_DIRECTION     0x04
 
 #undef _mm_blendv_ps
-#define _mm_blendv_ps __emu_mm_blendv_ps
-__forceinline __m128 _mm_blendv_ps( __m128 value, __m128 input, __m128 mask ) { 
-    return _mm_or_ps(_mm_and_ps(mask, input), _mm_andnot_ps(mask, value)); 
+#define _mm_blendv_ps _mm_blendv_ps_emu
+__forceinline __m128 _mm_blendv_ps_emu( __m128 value, __m128 input, __m128 mask)
+{
+    __m128i isignmask = _mm_set1_epi32(0x80000000);
+    __m128 signmask = _mm_castsi128_ps(isignmask);
+    __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
+    __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
+    __m128 cmpmask = _mm_castsi128_ps(icmpmask);
+    return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
 }
 
 #undef _mm_blend_ps
-#define _mm_blend_ps __emu_mm_blend_ps
-__forceinline __m128 _mm_blend_ps( __m128 value, __m128 input, const int mask ) { 
+#define _mm_blend_ps _mm_blend_ps_emu
+__forceinline __m128 _mm_blend_ps_emu( __m128 value, __m128 input, const int mask)
+{
     assert(mask < 0x10); return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]); 
 }
 
 #undef _mm_blendv_epi8
-#define _mm_blendv_epi8 __emu_mm_blendv_epi8
-__forceinline __m128i _mm_blendv_epi8( __m128i value, __m128i input, __m128i mask ) { 
+#define _mm_blendv_epi8 _mm_blendv_epi8_emu
+__forceinline __m128i _mm_blendv_epi8_emu( __m128i value, __m128i input, __m128i mask)
+{
     return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value)); 
 }
 
-#undef _mm_mullo_epi32
-#define _mm_mullo_epi32 __emu_mm_mullo_epi32
-__forceinline __m128i _mm_mullo_epi32( __m128i value, __m128i input ) {
-  __m128i rvalue;
-  char* _r = (char*)(&rvalue + 1);
-  char* _v = (char*)(& value + 1);
-  char* _i = (char*)(& input + 1);
-  for( ssize_t i = -16 ; i != 0 ; i += 4 ) *((int32_t*)(_r + i)) = *((int32_t*)(_v + i))*  *((int32_t*)(_i + i));
-  return rvalue;
-}
-
 #undef _mm_min_epi32
-#define _mm_min_epi32 __emu_mm_min_epi32
-__forceinline __m128i _mm_min_epi32( __m128i value, __m128i input ) { 
+#define _mm_min_epi32 _mm_min_epi32_emu
+__forceinline __m128i _mm_min_epi32_emu( __m128i value, __m128i input)
+{
     return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input)); 
 }
 
 #undef _mm_max_epi32
-#define _mm_max_epi32 __emu_mm_max_epi32
-__forceinline __m128i _mm_max_epi32( __m128i value, __m128i input ) { 
+#define _mm_max_epi32 _mm_max_epi32_emu
+__forceinline __m128i _mm_max_epi32_emu( __m128i value, __m128i input)
+{
     return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input)); 
 }
 
 #undef _mm_extract_epi32
-#define _mm_extract_epi32 __emu_mm_extract_epi32
-__forceinline int _mm_extract_epi32( __m128i input, const int index ) {
-  switch ( index ) {
+#define _mm_extract_epi32 _mm_extract_epi32_emu
+__forceinline int _mm_extract_epi32_emu( __m128i input, const int index)
+{
+  switch(index) {
   case 0: return _mm_cvtsi128_si32(input);
   case 1: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
   case 2: return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
@@ -395,27 +414,26 @@ __forceinline int _mm_extract_epi32( __m128i input, const int index ) {
 }
 
 #undef _mm_insert_epi32
-#define _mm_insert_epi32 __emu_mm_insert_epi32
-__forceinline __m128i _mm_insert_epi32( __m128i value, int input, const int index ) { 
+#define _mm_insert_epi32 _mm_insert_epi32_emu
+__forceinline __m128i _mm_insert_epi32_emu( __m128i value, int input, const int index)
+{
     assert(index >= 0 && index < 4); ((int*)&value)[index] = input; return value; 
 }
 
-#undef _mm_extract_ps
-#define _mm_extract_ps __emu_mm_extract_ps
-__forceinline int _mm_extract_ps( __m128 input, const int index ) {
-  int32_t* ptr = (int32_t*)&input; return ptr[index];
-}
-
 #undef _mm_insert_ps
-#define _mm_insert_ps __emu_mm_insert_ps
-__forceinline __m128 _mm_insert_ps( __m128 value, __m128 input, const int index )
-{ assert(index < 0x100); ((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6]; return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value); }
+#define _mm_insert_ps _mm_insert_ps_emu
+__forceinline __m128 _mm_insert_ps_emu( __m128 value, __m128 input, const int index)
+{
+	assert(index < 0x100);
+	((float*)&value)[(index >> 4)&0x3] = ((float*)&input)[index >> 6];
+	return _mm_andnot_ps(_mm_lookupmask_ps[index&0xf], value);
+}
 
 #undef _mm_round_ps
-#define _mm_round_ps __emu_mm_round_ps
-__forceinline __m128 _mm_round_ps( __m128 value, const int flags )
+#define _mm_round_ps _mm_round_ps_emu
+__forceinline __m128 _mm_round_ps_emu( __m128 value, const int flags)
 {
-  switch ( flags )
+  switch(flags)
   {
   case _MM_FROUND_TO_NEAREST_INT: return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
   case _MM_FROUND_TO_NEG_INF    : return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
@@ -425,57 +443,7 @@ __forceinline __m128 _mm_round_ps( __m128 value, const int flags )
   return value;
 }
 
-#    ifdef _M_X64
-#undef _mm_insert_epi64
-#define _mm_insert_epi64 __emu_mm_insert_epi64
-__forceinline __m128i _mm_insert_epi64( __m128i value, __int64 input, const int index ) { 
-    assert(size_t(index) < 4); ((__int64*)&value)[index] = input; return value; 
-}
-
-#undef _mm_extract_epi64
-#define _mm_extract_epi64 __emu_mm_extract_epi64
-__forceinline __int64 _mm_extract_epi64( __m128i input, const int index ) { 
-    assert(size_t(index) < 2); 
-    return index == 0 ? _mm_cvtsi128_si64x(input) : _mm_cvtsi128_si64x(_mm_unpackhi_epi64(input, input)); 
-}
-#    endif
-
-#  endif
-
-#undef _mm_fabs_ps
-#define _mm_fabs_ps(x) _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
-
-/* Return a __m128 with every element set to the largest element of v. */
-ccl_device_inline __m128 _mm_hmax_ps(__m128 v)
-{
-  /* v[0, 1, 2, 3] => [0, 1, 0, 1] and [2, 3, 2, 3] => v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] */
-  v = _mm_max_ps(_mm_movehl_ps(v, v), _mm_movelh_ps(v, v));
-  /* v[max(0, 2), max(1, 3), max(0, 2), max(1, 3)] => [4 times max(1, 3)] and [4 times max(0, 2)] => v[4 times max(0, 1, 2, 3)] */
-  v = _mm_max_ps(_mm_movehdup_ps(v), _mm_moveldup_ps(v));
-  return v;
-}
-
-/* Return the sum of the four elements of x. */
-ccl_device_inline float _mm_hsum_ss(__m128 x)
-{
-    __m128 a = _mm_movehdup_ps(x);
-    __m128 b = _mm_add_ps(x, a);
-    return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(a, b), b));
-}
-
-/* Return a __m128 with every element set to the sum of the four elements of x. */
-ccl_device_inline __m128 _mm_hsum_ps(__m128 x)
-{
-    x = _mm_hadd_ps(x, x);
-    x = _mm_hadd_ps(x, x);
-    return x;
-}
-
-/* Replace elements of x with zero where mask isn't set. */
-#undef _mm_mask_ps
-#define _mm_mask_ps(x, mask) _mm_blendv_ps(_mm_setzero_ps(), x, mask)
-
-#endif
+#endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
 
 #else  /* __KERNEL_SSE2__ */
 
@@ -496,13 +464,19 @@ ccl_device_inline int bitscan(int value)
 
 #endif /* __KERNEL_SSE2__ */
 
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)  || \
+	defined(__KERNEL_SSE3__)  || \
+	defined(__KERNEL_SSSE3__) || \
+	defined(__KERNEL_SSE41__) || \
+	defined(__KERNEL_AVX__)   || \
+	defined(__KERNEL_AVX2__)
+	/* do nothing */
+#endif
+
 CCL_NAMESPACE_END
 
-#include "util/util_math.h"
-#include "util/util_sseb.h"
-#include "util/util_ssei.h"
-#include "util/util_ssef.h"
-#include "util/util_avxf.h"
+#endif /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
 
diff --git a/intern/cycles/util/util_sseb.h b/intern/cycles/util/util_sseb.h
index 6e669701f3b..93c22aafdcd 100644
--- a/intern/cycles/util/util_sseb.h
+++ b/intern/cycles/util/util_sseb.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct ssei;
+struct ssef;
+
 /*! 4-wide SSE bool type. */
 struct sseb
 {
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index cf99a08efae..bb007ff84a9 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct sseb;
+struct ssei;
+
 /*! 4-wide SSE float type. */
 struct ssef
 {
diff --git a/intern/cycles/util/util_ssei.h b/intern/cycles/util/util_ssei.h
index 5f62569268c..ef2a9e68b7d 100644
--- a/intern/cycles/util/util_ssei.h
+++ b/intern/cycles/util/util_ssei.h
@@ -22,6 +22,9 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __KERNEL_SSE2__
 
+struct sseb;
+struct ssef;
+
 /*! 4-wide SSE integer type. */
 struct ssei
 {
@@ -234,8 +237,10 @@ __forceinline size_t select_max(const sseb& valid, const ssei& v) { const ssei a
 
 #else
 
-__forceinline int reduce_min(const ssei& v) { return min(min(v[0],v[1]),min(v[2],v[3])); }
-__forceinline int reduce_max(const ssei& v) { return max(max(v[0],v[1]),max(v[2],v[3])); }
+__forceinline int ssei_min(int a, int b) { return (a < b)? a: b; }
+__forceinline int ssei_max(int a, int b) { return (a > b)? a: b; }
+__forceinline int reduce_min(const ssei& v) { return ssei_min(ssei_min(v[0],v[1]),ssei_min(v[2],v[3])); }
+__forceinline int reduce_max(const ssei& v) { return ssei_max(ssei_max(v[0],v[1]),ssei_max(v[2],v[3])); }
 __forceinline int reduce_add(const ssei& v) { return v[0]+v[1]+v[2]+v[3]; }
 
 #endif
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index baba549753d..7667f58eb7d 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -30,7 +30,7 @@ public:
 
 	void mem_alloc(size_t size) {
 		atomic_add_and_fetch_z(&mem_used, size);
-		atomic_update_max_z(&mem_peak, mem_used);
+		atomic_fetch_and_update_max_z(&mem_peak, mem_used);
 	}
 
 	void mem_free(size_t size) {
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index fb0c34e1dc4..6ed97b0e0a6 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -206,9 +206,9 @@ void TaskScheduler::init(int num_threads)
 		threads.resize(num_threads);
 
 		const int num_groups = system_cpu_group_count();
-		unsigned short num_process_groups;
+		unsigned short num_process_groups = 0;
 		vector<unsigned short> process_groups;
-		int current_group_threads;
+		int current_group_threads = 0;
 		if(num_groups > 1) {
 			process_groups.resize(num_groups);
 			num_process_groups = system_cpu_process_groups(num_groups, 
diff --git a/intern/cycles/util/util_time.h b/intern/cycles/util/util_time.h
index 65798244111..f03aa590e9b 100644
--- a/intern/cycles/util/util_time.h
+++ b/intern/cycles/util/util_time.h
@@ -37,7 +37,7 @@ public:
 	~scoped_timer()
 	{
 		if(value_ != NULL) {
-			*value_ = time_dt() - time_start_;
+			*value_ = get_time();
 		}
 	}
 
@@ -46,6 +46,11 @@ public:
 		return time_start_;
 	}
 
+	double get_time() const
+	{
+		return time_dt() - time_start_;
+	}
+
 protected:
 	double *value_;
 	double time_start_;
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index a5d1d7152d5..aabca6c81fc 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -21,72 +21,18 @@
 #  include <stdlib.h>
 #endif
 
-/* Bitness */
+/* Standard Integer Types */
 
-#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64)
-#  define __KERNEL_64_BIT__
+#if !defined(__KERNEL_GPU__) && !defined(_WIN32)
+#  include <stdint.h>
 #endif
 
-/* Qualifiers for kernel code shared by CPU and GPU */
-
-#ifndef __KERNEL_GPU__
-#  define ccl_device static inline
-#  define ccl_device_noinline static
-#  define ccl_global
-#  define ccl_constant
-#  define ccl_local
-#  define ccl_local_param
-#  define ccl_private
-#  define ccl_restrict __restrict
-#  define __KERNEL_WITH_SSE_ALIGN__
-
-#  if defined(_WIN32) && !defined(FREE_WINDOWS)
-#    define ccl_device_inline static __forceinline
-#    define ccl_device_forceinline static __forceinline
-#    define ccl_align(...) __declspec(align(__VA_ARGS__))
-#    ifdef __KERNEL_64_BIT__
-#      define ccl_try_align(...) __declspec(align(__VA_ARGS__))
-#    else  /* __KERNEL_64_BIT__ */
-#      undef __KERNEL_WITH_SSE_ALIGN__
-/* No support for function arguments (error C2719). */
-#      define ccl_try_align(...)
-#    endif  /* __KERNEL_64_BIT__ */
-#    define ccl_may_alias
-#    define ccl_always_inline __forceinline
-#    define ccl_never_inline __declspec(noinline)
-#    define ccl_maybe_unused
-#  else  /* _WIN32 && !FREE_WINDOWS */
-#    define ccl_device_inline static inline __attribute__((always_inline))
-#    define ccl_device_forceinline static inline __attribute__((always_inline))
-#    define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
-#    ifndef FREE_WINDOWS64
-#      define __forceinline inline __attribute__((always_inline))
-#    endif
-#    define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
-#    define ccl_may_alias __attribute__((__may_alias__))
-#    define ccl_always_inline __attribute__((always_inline))
-#    define ccl_never_inline __attribute__((noinline))
-#    define ccl_maybe_unused __attribute__((used))
-#  endif  /* _WIN32 && !FREE_WINDOWS */
-
-/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
-#  if defined(__GNUC__) && (__GNUC__ >= 7)  /* gcc7.0+ only */
-#    define ATTR_FALLTHROUGH __attribute__((fallthrough))
-#  else
-#    define ATTR_FALLTHROUGH ((void)0)
-#  endif
-#endif  /* __KERNEL_GPU__ */
-
-/* Standard Integer Types */
+#include "util/util_defines.h"
 
 #ifndef __KERNEL_GPU__
-/* int8_t, uint16_t, and friends */
-#  ifndef _WIN32
-#    include <stdint.h>
-#  endif
-/* SIMD Types */
 #  include "util/util_optimization.h"
-#endif  /* __KERNEL_GPU__ */
+#  include "util/util_simd.h"
+#endif
 
 CCL_NAMESPACE_BEGIN
 
@@ -201,65 +147,8 @@ enum ExtensionType {
 	EXTENSION_NUM_TYPES,
 };
 
-/* macros */
-
-/* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
-#  define LIKELY(x)       __builtin_expect(!!(x), 1)
-#  define UNLIKELY(x)     __builtin_expect(!!(x), 0)
-#else
-#  define LIKELY(x)       (x)
-#  define UNLIKELY(x)     (x)
-#endif
-
-#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && _MSC_VER >= 1800))
-#  define HAS_CPP11_FEATURES
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#  if defined(HAS_CPP11_FEATURES)
-/* Some magic to be sure we don't have reference in the type. */
-template<typename T> static inline T decltype_helper(T x) { return x; }
-#    define TYPEOF(x) decltype(decltype_helper(x))
-#  else
-#    define TYPEOF(x) typeof(x)
-#  endif
-#endif
-
-/* Causes warning:
- * incompatible types when assigning to type 'Foo' from type 'Bar'
- * ... the compiler optimizes away the temp var */
-#ifdef __GNUC__
-#define CHECK_TYPE(var, type)  {  \
-	TYPEOF(var) *__tmp;           \
-	__tmp = (type *)NULL;         \
-	(void)__tmp;                  \
-} (void)0
-
-#define CHECK_TYPE_PAIR(var_a, var_b)  {  \
-	TYPEOF(var_a) *__tmp;                 \
-	__tmp = (typeof(var_b) *)NULL;        \
-	(void)__tmp;                          \
-} (void)0
-#else
-#  define CHECK_TYPE(var, type)
-#  define CHECK_TYPE_PAIR(var_a, var_b)
-#endif
-
-/* can be used in simple macros */
-#define CHECK_TYPE_INLINE(val, type) \
-	((void)(((type)0) != (val)))
-
-
 CCL_NAMESPACE_END
 
-#ifndef __KERNEL_GPU__
-#  include <cassert>
-#  define util_assert(statement)  assert(statement)
-#else
-#  define util_assert(statement)
-#endif
-
 /* Vectorized types declaration. */
 #include "util/util_types_uchar2.h"
 #include "util/util_types_uchar3.h"
@@ -298,5 +187,13 @@ CCL_NAMESPACE_END
 
 #include "util/util_types_vector3_impl.h"
 
+/* SSE types. */
+#ifndef __KERNEL_GPU__
+#  include "util/util_sseb.h"
+#  include "util/util_ssei.h"
+#  include "util/util_ssef.h"
+#  include "util/util_avxf.h"
+#endif
+
 #endif /* __UTIL_TYPES_H__ */