322 files changed, 9826 insertions, 7279 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 79c1c3e3e82..806a8660e8c 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -22,6 +22,7 @@ if(WITH_CYCLES_NATIVE_ONLY)
 		-DWITH_KERNEL_NATIVE
 	)
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+	set(CYCLES_KERNEL_FLAGS "-march=native")
 elseif(NOT WITH_CPU_SSE)
 	set(CXX_HAS_SSE FALSE)
 	set(CXX_HAS_AVX FALSE)
@@ -59,10 +60,13 @@ elseif(WIN32 AND MSVC)
 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox")
 	set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox")
 	set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox")
+
+	set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
 elseif(CMAKE_COMPILER_IS_GNUCC)
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
 	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
+	set(CYCLES_KERNEL_FLAGS "-ffast-math")
 	if(CXX_HAS_SSE)
 		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
 		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
@@ -74,10 +78,12 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
 	endif()
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
 	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
+	set(CYCLES_KERNEL_FLAGS "-ffast-math")
 	if(CXX_HAS_SSE)
 		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
 		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
@@ -89,6 +95,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
 	endif()
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
 endif()
 
 if(CXX_HAS_SSE)
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 8cd499b7ca6..aabb8f63640 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -1,14 +1,6 @@
 
 set(INC
-	.
-	../bvh
-	../device
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../subd
-	../util
+	..
 )
 set(INC_SYS
 )
diff --git a/intern/cycles/app/cycles_server.cpp b/intern/cycles/app/cycles_server.cpp
index 4ef9cd070bb..e65b9d769e4 100644
--- a/intern/cycles/app/cycles_server.cpp
+++ b/intern/cycles/app/cycles_server.cpp
@@ -16,15 +16,15 @@
 
 #include <stdio.h>
 
-#include "device.h"
-
-#include "util_args.h"
-#include "util_foreach.h"
-#include "util_path.h"
-#include "util_stats.h"
-#include "util_string.h"
-#include "util_task.h"
-#include "util_logging.h"
+#include "device/device.h"
+
+#include "util/util_args.h"
+#include "util/util_foreach.h"
+#include "util/util_path.h"
+#include "util/util_stats.h"
+#include "util/util_string.h"
+#include "util/util_task.h"
+#include "util/util_logging.h"
 
 using namespace ccl;
 
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 9816d614a7c..0cd249f0d84 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -16,29 +16,29 @@
 
 #include <stdio.h>
 
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "scene.h"
-#include "session.h"
-#include "integrator.h"
-
-#include "util_args.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_progress.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_transform.h"
-#include "util_version.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/integrator.h"
+
+#include "util/util_args.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_transform.h"
+#include "util/util_version.h"
 
 #ifdef WITH_CYCLES_STANDALONE_GUI
-#include "util_view.h"
+#include "util/util_view.h"
 #endif
 
-#include "cycles_xml.h"
+#include "app/cycles_xml.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 35a30ae683f..04f00ef0e10 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -20,31 +20,31 @@
 #include <algorithm>
 #include <iterator>
 
-#include "node_xml.h"
-
-#include "background.h"
-#include "camera.h"
-#include "film.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "osl.h"
-#include "shader.h"
-#include "scene.h"
-
-#include "subd_patch.h"
-#include "subd_split.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_path.h"
-#include "util_transform.h"
-#include "util_xml.h"
-
-#include "cycles_xml.h"
+#include "graph/node_xml.h"
+
+#include "render/background.h"
+#include "render/camera.h"
+#include "render/film.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/shader.h"
+#include "render/scene.h"
+
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_path.h"
+#include "util/util_transform.h"
+#include "util/util_xml.h"
+
+#include "app/cycles_xml.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index b57502b3b14..ae4977aaed0 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	../graph
-	../render
-	../device
-	../kernel
-	../kernel/svm
-	../util
-	../subd
+	..
 	../../glew-mx
 	../../guardedalloc
 	../../mikktspace
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 235d19e91e8..eb792af7264 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -107,7 +107,13 @@ def engine_exit():
     engine.exit()
 
 
+classes = (
+    CyclesRender,
+)
+
+
 def register():
+    from bpy.utils import register_class
     from . import ui
     from . import properties
     from . import presets
@@ -122,12 +128,15 @@ def register():
     properties.register()
     ui.register()
     presets.register()
-    bpy.utils.register_module(__name__)
+
+    for cls in classes:
+        register_class(cls)
 
     bpy.app.handlers.version_update.append(version_update.do_versions)
 
 
 def unregister():
+    from bpy.utils import unregister_class
     from . import ui
     from . import properties
     from . import presets
@@ -138,4 +147,6 @@ def unregister():
     ui.unregister()
     properties.unregister()
     presets.unregister()
-    bpy.utils.unregister_module(__name__)
+
+    for cls in classes:
+        unregister_class(cls)
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index c8c9ef58c52..ab57dd44bdb 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -50,6 +50,24 @@ def _workaround_buggy_drivers():
             _cycles.opencl_disable()
 
 
+def _configure_argument_parser():
+    """Build the parser for the Cycles-specific command line options."""
+    import argparse
+    # (flag, help text) pairs; every option defaults to None when not given.
+    options = (
+        ("--cycles-resumable-num-chunks",
+         "Number of chunks to split sample range into"),
+        ("--cycles-resumable-current-chunk",
+         "Current chunk of samples range to render"),
+        ("--cycles-resumable-start-chunk", "Start chunk to render"),
+        ("--cycles-resumable-end-chunk", "End chunk to render"),
+    )
+    parser = argparse.ArgumentParser(description="Cycles Addon argument parser")
+    for flag, help_text in options:
+        parser.add_argument(flag, help=help_text, default=None)
+    return parser
+
+
 def _parse_command_line():
     import sys
 
@@ -57,25 +75,22 @@ def _parse_command_line():
     if "--" not in argv:
         return
 
-    argv = argv[argv.index("--") + 1:]
-
-    num_resumable_chunks = None
-    current_resumable_chunk = None
-
-    # TODO(sergey): Add some nice error prints if argument is not used properly.
-    idx = 0
-    while idx < len(argv) - 1:
-        arg = argv[idx]
-        if arg == '--cycles-resumable-num-chunks':
-            num_resumable_chunks = int(argv[idx + 1])
-        elif arg == '--cycles-resumable-current-chunk':
-            current_resumable_chunk = int(argv[idx + 1])
-        idx += 1
+    parser = _configure_argument_parser()
+    args, unknown = parser.parse_known_args(argv[argv.index("--") + 1:])
 
-    if num_resumable_chunks is not None and current_resumable_chunk is not None:
-        import _cycles
-        _cycles.set_resumable_chunks(num_resumable_chunks,
-                                     current_resumable_chunk)
+    if args.cycles_resumable_num_chunks is not None:
+        if args.cycles_resumable_current_chunk is not None:
+            import _cycles
+            _cycles.set_resumable_chunk(
+                    int(args.cycles_resumable_num_chunks),
+                    int(args.cycles_resumable_current_chunk))
+        elif (args.cycles_resumable_start_chunk is not None and
+              args.cycles_resumable_end_chunk is not None):  # range needs both ends
+            import _cycles
+            _cycles.set_resumable_chunk_range(
+                    int(args.cycles_resumable_num_chunks),
+                    int(args.cycles_resumable_start_chunk),
+                    int(args.cycles_resumable_end_chunk))
 
 
 def init():
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index f97b51b629d..82c4ffc6e50 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -82,12 +82,23 @@ class AddPresetSampling(AddPresetBase, Operator):
     preset_subdir = "cycles/sampling"
 
 
+classes = (
+    AddPresetIntegrator,
+    AddPresetSampling,
+)
+
+
 def register():
-    pass
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
 
 
 def unregister():
-    pass
+    from bpy.utils import unregister_class
+    for cls in classes:
+        unregister_class(cls)
+
 
 if __name__ == "__main__":
     register()
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 5c51f9afc28..cbf469b3a89 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -665,8 +665,10 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True)
         cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
         cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)
+        cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
 
         cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
+        cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)
 
         cls.debug_opencl_kernel_type = EnumProperty(
             name="OpenCL Kernel Type",
@@ -693,6 +695,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
             update=devices_update_callback
             )
 
+        cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=False, update=devices_update_callback)
+
         cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False)
 
     @classmethod
@@ -1092,6 +1096,12 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
                 default=1.0,
                 )
 
+        cls.is_shadow_catcher = BoolProperty(
+                name="Shadow Catcher",
+                description="Only render shadows on this object, for compositing renders into real footage",
+                default=False,
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Object.cycles
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 44af5f7efed..2b50d272be8 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -86,12 +86,10 @@ def use_sample_all_lights(context):
 
     return cscene.sample_all_lights_direct or cscene.sample_all_lights_indirect
 
-def show_device_selection(context):
-    type = get_device_type(context)
-    if type == 'NETWORK':
+def show_device_active(context):
+    cscene = context.scene.cycles
+    if cscene.device != 'GPU':
         return True
-    if not type in {'CUDA', 'OPENCL'}:
-        return False
     return context.user_preferences.addons[__package__].preferences.has_active_device()
 
 
@@ -186,9 +184,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.label(text="AA Samples:")
             sub.prop(cscene, "aa_samples", text="Render")
             sub.prop(cscene, "preview_aa_samples", text="Preview")
-            sub.separator()
-            sub.prop(cscene, "sample_all_lights_direct")
-            sub.prop(cscene, "sample_all_lights_indirect")
 
             col = split.column()
             sub = col.column(align=True)
@@ -205,6 +200,10 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.prop(cscene, "subsurface_samples", text="Subsurface")
             sub.prop(cscene, "volume_samples", text="Volume")
 
+            col = layout.column(align=True)
+            col.prop(cscene, "sample_all_lights_direct")
+            col.prop(cscene, "sample_all_lights_indirect")
+
         if not (use_opencl(context) and cscene.feature_set != 'EXPERIMENTAL'):
             layout.row().prop(cscene, "sampling_pattern", text="Pattern")
 
@@ -270,7 +269,7 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
 
         row = col.row()
         row.prop(ccscene, "minimum_width", text="Min Pixels")
-        row.prop(ccscene, "maximum_width", text="Max Ext.")
+        row.prop(ccscene, "maximum_width", text="Max Extension")
 
 
 class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
@@ -788,6 +787,8 @@ class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel):
         if ob.type != 'LAMP':
             flow.prop(visibility, "shadow")
 
+        layout.prop(cob, "is_shadow_catcher")
+
         col = layout.column()
         col.label(text="Performance:")
         row = col.row()
@@ -1518,15 +1519,18 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
         row.prop(cscene, "debug_use_cpu_avx", toggle=True)
         row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
         col.prop(cscene, "debug_use_qbvh")
+        col.prop(cscene, "debug_use_cpu_split_kernel")
 
         col = layout.column()
         col.label('CUDA Flags:')
         col.prop(cscene, "debug_use_cuda_adaptive_compile")
+        col.prop(cscene, "debug_use_cuda_split_kernel")
 
         col = layout.column()
         col.label('OpenCL Flags:')
         col.prop(cscene, "debug_opencl_kernel_type", text="Kernel")
         col.prop(cscene, "debug_opencl_device_type", text="Device")
+        col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program")
         col.prop(cscene, "debug_use_opencl_debug", text="Debug")
 
 
@@ -1633,7 +1637,7 @@ def draw_device(self, context):
         split = layout.split(percentage=1/3)
         split.label("Device:")
         row = split.row()
-        row.active = show_device_selection(context)
+        row.active = show_device_active(context)
         row.prop(cscene, "device", text="")
 
         if engine.with_osl() and use_cpu(context):
@@ -1712,17 +1716,75 @@ def get_panels():
 
     return panels
 
+
+classes = (
+    CYCLES_MT_sampling_presets,
+    CYCLES_MT_integrator_presets,
+    CyclesRender_PT_sampling,
+    CyclesRender_PT_geometry,
+    CyclesRender_PT_light_paths,
+    CyclesRender_PT_motion_blur,
+    CyclesRender_PT_film,
+    CyclesRender_PT_performance,
+    CyclesRender_PT_layer_options,
+    CyclesRender_PT_layer_passes,
+    CyclesRender_PT_views,
+    Cycles_PT_post_processing,
+    CyclesCamera_PT_dof,
+    Cycles_PT_context_material,
+    CyclesObject_PT_motion_blur,
+    CyclesObject_PT_cycles_settings,
+    CYCLES_OT_use_shading_nodes,
+    CyclesLamp_PT_preview,
+    CyclesLamp_PT_lamp,
+    CyclesLamp_PT_nodes,
+    CyclesLamp_PT_spot,
+    CyclesWorld_PT_preview,
+    CyclesWorld_PT_surface,
+    CyclesWorld_PT_volume,
+    CyclesWorld_PT_ambient_occlusion,
+    CyclesWorld_PT_mist,
+    CyclesWorld_PT_ray_visibility,
+    CyclesWorld_PT_settings,
+    CyclesMaterial_PT_preview,
+    CyclesMaterial_PT_surface,
+    CyclesMaterial_PT_volume,
+    CyclesMaterial_PT_displacement,
+    CyclesMaterial_PT_settings,
+    CyclesTexture_PT_context,
+    CyclesTexture_PT_node,
+    CyclesTexture_PT_mapping,
+    CyclesTexture_PT_colors,
+    CyclesParticle_PT_textures,
+    CyclesRender_PT_bake,
+    CyclesRender_PT_debug,
+    CyclesParticle_PT_CurveSettings,
+    CyclesScene_PT_simplify,
+)
+
+
 def register():
+    from bpy.utils import register_class
+
     bpy.types.RENDER_PT_render.append(draw_device)
     bpy.types.VIEW3D_HT_header.append(draw_pause)
 
     for panel in get_panels():
         panel.COMPAT_ENGINES.add('CYCLES')
 
+    for cls in classes:
+        register_class(cls)
+
+
 def unregister():
+    from bpy.utils import unregister_class
+
     bpy.types.RENDER_PT_render.remove(draw_device)
     bpy.types.VIEW3D_HT_header.remove(draw_pause)
 
     for panel in get_panels():
         if 'CYCLES' in panel.COMPAT_ENGINES:
             panel.COMPAT_ENGINES.remove('CYCLES')
+
+    for cls in classes:
+        unregister_class(cls)
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index f02fc553908..40d6b25f2b7 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "scene.h"
+#include "render/camera.h"
+#include "render/scene.h"
 
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
-#include "util_logging.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index e42ff5d72a6..6fa038e8bf0 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -14,18 +14,18 @@
  * limitations under the License.
  */
 
-#include "attribute.h"
-#include "camera.h"
-#include "curves.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
+#include "render/attribute.h"
+#include "render/camera.h"
+#include "render/curves.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
 
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -411,6 +411,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 		}
 	}
 
+	mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -434,8 +435,8 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
-			numverts += (CData->curve_keynum[curve] - 2)*2*resolution + resolution;
-			numtris += (CData->curve_keynum[curve] - 2)*resolution;
+			numverts += (CData->curve_keynum[curve] - 1)*resolution + resolution;
+			numtris += (CData->curve_keynum[curve] - 1)*2*resolution;
 		}
 	}
 
@@ -545,6 +546,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 		}
 	}
 
+	mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -890,7 +892,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
 	}
 
 	/* obtain general settings */
-	bool use_curves = scene->curve_system_manager->use_curves;
+	const bool use_curves = scene->curve_system_manager->use_curves;
 
 	if(!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT)) {
 		if(!motion)
@@ -898,11 +900,11 @@ void BlenderSync::sync_curves(Mesh *mesh,
 		return;
 	}
 
-	int primitive = scene->curve_system_manager->primitive;
-	int triangle_method = scene->curve_system_manager->triangle_method;
-	int resolution = scene->curve_system_manager->resolution;
-	size_t vert_num = mesh->verts.size();
-	size_t tri_num = mesh->num_triangles();
+	const int primitive = scene->curve_system_manager->primitive;
+	const int triangle_method = scene->curve_system_manager->triangle_method;
+	const int resolution = scene->curve_system_manager->resolution;
+	const size_t vert_num = mesh->verts.size();
+	const size_t tri_num = mesh->num_triangles();
 	int used_res = 1;
 
 	/* extract particle hair data - should be combined with connecting to mesh later*/
diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp
index f4f86929168..d0f82e37662 100644
--- a/intern/cycles/blender/blender_logging.cpp
+++ b/intern/cycles/blender/blender_logging.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "CCL_api.h"
-#include "util_logging.h"
+#include "blender/CCL_api.h"
+#include "util/util_logging.h"
 
 void CCL_init_logging(const char *argv0)
 {
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 85117cfff7b..e0e89cec65c 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -15,21 +15,22 @@
  */
 
  
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "camera.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/camera.h"
 
-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
 
-#include "subd_patch.h"
-#include "subd_split.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_math.h"
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
 
 #include "mikktspace.h"
 
@@ -525,69 +526,177 @@ static void attr_create_uv_map(Scene *scene,
 }
 
 /* Create vertex pointiness attributes. */
+
+/* Compare vertices by sum of their coordinates, ties by descending index. */
+class VertexAverageComparator {
+public:
+	VertexAverageComparator(const array<float3>& verts)
+	        : verts_(verts) {
+	}
+
+	bool operator()(const int& vert_idx_a, const int& vert_idx_b) const
+	{
+		const float3 &vert_a = verts_[vert_idx_a];
+		const float3 &vert_b = verts_[vert_idx_b];
+		const float x1 = vert_a.x + vert_a.y + vert_a.z;
+		const float x2 = vert_b.x + vert_b.y + vert_b.z;
+		if(x1 != x2) {
+			return x1 < x2;
+		}
+		/* Same sum (doubles too): fall back to index, keeps ordering strict. */
+		return vert_idx_a > vert_idx_b;
+	}
+
+protected:
+	const array<float3>& verts_;
+};
+
 static void attr_create_pointiness(Scene *scene,
                                    Mesh *mesh,
                                    BL::Mesh& b_mesh,
                                    bool subdivision)
 {
-	if(mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
-		const int numverts = b_mesh.vertices.length();
-		AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
-		Attribute *attr = attributes.add(ATTR_STD_POINTINESS);
-		float *data = attr->data_float();
-		int *counter = new int[numverts];
-		float *raw_data = new float[numverts];
-		float3 *edge_accum = new float3[numverts];
-
-		/* Calculate pointiness using single ring neighborhood. */
-		memset(counter, 0, sizeof(int) * numverts);
-		memset(raw_data, 0, sizeof(float) * numverts);
-		memset(edge_accum, 0, sizeof(float3) * numverts);
-		BL::Mesh::edges_iterator e;
-		int i = 0;
-		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
-			int v0 = b_mesh.edges[i].vertices()[0],
-			    v1 = b_mesh.edges[i].vertices()[1];
-			float3 co0 = get_float3(b_mesh.vertices[v0].co()),
-			       co1 = get_float3(b_mesh.vertices[v1].co());
-			float3 edge = normalize(co1 - co0);
-			edge_accum[v0] += edge;
-			edge_accum[v1] += -edge;
-			++counter[v0];
-			++counter[v1];
-		}
-		i = 0;
-		BL::Mesh::vertices_iterator v;
-		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v, ++i) {
-			if(counter[i] > 0) {
-				float3 normal = get_float3(b_mesh.vertices[i].normal());
-				float angle = safe_acosf(dot(normal, edge_accum[i] / counter[i]));
-				raw_data[i] = angle * M_1_PI_F;
+	if(!mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
+		return;
+	}
+	const int num_verts = b_mesh.vertices.length();
+	/* STEP 1: Find out duplicated vertices and point duplicates to a single
+	 *         original vertex.
+	 */
+	vector<int> sorted_vert_indeices(num_verts);
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		sorted_vert_indeices[vert_index] = vert_index;
+	}
+	VertexAverageComparator compare(mesh->verts);
+	sort(sorted_vert_indeices.begin(), sorted_vert_indeices.end(), compare);
+	/* This array stores index of the original vertex for the given vertex
+	 * index.
+	 */
+	vector<int> vert_orig_index(num_verts);
+	for(int sorted_vert_index = 0;
+	    sorted_vert_index < num_verts;
+	    ++sorted_vert_index)
+	{
+		const int vert_index = sorted_vert_indeices[sorted_vert_index];
+		const float3 &vert_co = mesh->verts[vert_index];
+		bool found = false;
+		for(int other_sorted_vert_index = sorted_vert_index + 1;
+		    other_sorted_vert_index < num_verts;
+		    ++other_sorted_vert_index)
+		{
+			const int other_vert_index =
+			        sorted_vert_indeices[other_sorted_vert_index];
+			const float3 &other_vert_co = mesh->verts[other_vert_index];
+			/* We are too far away now, there can be no duplicate past here. */
+			if((other_vert_co.x + other_vert_co.y + other_vert_co.z) -
+			   (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON)
+			{
+				break;
 			}
-			else {
-				raw_data[i] = 0.0f;
+			/* Found duplicate. */
+			if(len_squared(other_vert_co - vert_co) < FLT_EPSILON) {
+				found = true;
+				vert_orig_index[vert_index] = other_vert_index;
+				break;
 			}
 		}
-
-		/* Blur vertices to approximate 2 ring neighborhood. */
-		memset(counter, 0, sizeof(int) * numverts);
-		memcpy(data, raw_data, sizeof(float) * numverts);
-		i = 0;
-		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
-			int v0 = b_mesh.edges[i].vertices()[0],
-			    v1 = b_mesh.edges[i].vertices()[1];
-			data[v0] += raw_data[v1];
-			data[v1] += raw_data[v0];
-			++counter[v0];
-			++counter[v1];
+		if(!found) {
+			vert_orig_index[vert_index] = vert_index;
+		}
+	}
+	/* Make sure we always point to the very first orig vertex. */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		int orig_index = vert_orig_index[vert_index];
+		while(orig_index != vert_orig_index[orig_index]) {
+			orig_index = vert_orig_index[orig_index];
 		}
-		for(i = 0; i < numverts; ++i) {
-			data[i] /= counter[i] + 1;
+		vert_orig_index[vert_index] = orig_index;
+	}
+	sorted_vert_indeices.free_memory();
+	/* STEP 2: Calculate vertex normals taking into account their possible
+	 *         duplicates which get "welded" together.
+	 */
+	vector<float3> vert_normal(num_verts, make_float3(0.0f, 0.0f, 0.0f));
+	/* First we accumulate all vertex normals in the original index. */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const float3 normal = get_float3(b_mesh.vertices[vert_index].normal());
+		const int orig_index = vert_orig_index[vert_index];
+		vert_normal[orig_index] += normal;
+	}
+	/* Then we normalize the accumulated result and flush it to all duplicates
+	 * as well.
+	 */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const int orig_index = vert_orig_index[vert_index];
+		vert_normal[vert_index] = normalize(vert_normal[orig_index]);
+	}
+	/* STEP 3: Calculate pointiness using single ring neighborhood. */
+	vector<int> counter(num_verts, 0);
+	vector<float> raw_data(num_verts, 0.0f);
+	vector<float3> edge_accum(num_verts, make_float3(0.0f, 0.0f, 0.0f));
+	BL::Mesh::edges_iterator e;
+	EdgeMap visited_edges;
+	int edge_index = 0;
+	memset(&counter[0], 0, sizeof(int) * counter.size());
+	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) {
+		const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]],
+		          v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]];
+		if(visited_edges.exists(v0, v1)) {
+			continue;
+		}
+		visited_edges.insert(v0, v1);
+		float3 co0 = get_float3(b_mesh.vertices[v0].co()),
+		       co1 = get_float3(b_mesh.vertices[v1].co());
+		float3 edge = normalize(co1 - co0);
+		edge_accum[v0] += edge;
+		edge_accum[v1] += -edge;
+		++counter[v0];
+		++counter[v1];
+	}
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const int orig_index = vert_orig_index[vert_index];
+		if(orig_index != vert_index) {
+			/* Skip duplicates, they'll be overwritten later on. */
+			continue;
+		}
+		if(counter[vert_index] > 0) {
+			const float3 normal = vert_normal[vert_index];
+			const float angle =
+			        safe_acosf(dot(normal,
+			                       edge_accum[vert_index] / counter[vert_index]));
+			raw_data[vert_index] = angle * M_1_PI_F;
+		}
+		else {
+			raw_data[vert_index] = 0.0f;
 		}
-
-		delete [] counter;
-		delete [] raw_data;
-		delete [] edge_accum;
+	}
+	/* STEP 3 (continued): Blur vertices to approximate 2 ring neighborhood. */
+	AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
+	Attribute *attr = attributes.add(ATTR_STD_POINTINESS);
+	float *data = attr->data_float();
+	memcpy(data, &raw_data[0], sizeof(float) * raw_data.size());
+	memset(&counter[0], 0, sizeof(int) * counter.size());
+	edge_index = 0;
+	visited_edges.clear();
+	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) {
+		const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]],
+		          v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]];
+		if(visited_edges.exists(v0, v1)) {
+			continue;
+		}
+		visited_edges.insert(v0, v1);
+		data[v0] += raw_data[v1];
+		data[v1] += raw_data[v0];
+		++counter[v0];
+		++counter[v1];
+	}
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		data[vert_index] /= counter[vert_index] + 1;
+	}
+	/* STEP 4: Copy attribute to the duplicated vertices. */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const int orig_index = vert_orig_index[vert_index];
+		data[vert_index] = data[orig_index];
 	}
 }
 
@@ -656,9 +765,6 @@ static void create_mesh(Scene *scene,
 			generated[i++] = get_float3(v->undeformed_co())*size - loc;
 	}
 
-	/* Create needed vertex attributes. */
-	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
-
 	/* create faces */
 	vector<int> nverts(numfaces);
 	vector<int> face_flags(numfaces, FACE_FLAG_NONE);
@@ -671,6 +777,15 @@ static void create_mesh(Scene *scene,
 			int shader = clamp(f->material_index(), 0, used_shaders.size()-1);
 			bool smooth = f->use_smooth() || use_loop_normals;
 
+			if(use_loop_normals) {
+				BL::Array<float, 12> loop_normals = f->split_normals();
+				for(int i = 0; i < n; i++) {
+					N[vi[i]] = make_float3(loop_normals[i * 3],
+					                       loop_normals[i * 3 + 1],
+					                       loop_normals[i * 3 + 2]);
+				}
+			}
+
 			/* Create triangles.
 			 *
 			 * NOTE: Autosmooth is already taken care about.
@@ -704,7 +819,7 @@ static void create_mesh(Scene *scene,
 			int shader = clamp(p->material_index(), 0, used_shaders.size()-1);
 			bool smooth = p->use_smooth() || use_loop_normals;
 
-			vi.reserve(n);
+			vi.resize(n);
 			for(int i = 0; i < n; i++) {
 				/* NOTE: Autosmooth is already taken care about. */
 				vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index();
@@ -718,6 +833,7 @@ static void create_mesh(Scene *scene,
 	/* Create all needed attributes.
 	 * The calculate functions will check whether they're needed or not.
 	 */
+	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
 	attr_create_vertex_color(scene, mesh, b_mesh, nverts, face_flags, subdivision);
 	attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags, subdivision, subdivide_uvs);
 
@@ -1178,4 +1294,3 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 637cf7abda8..d05699236cc 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -14,24 +14,24 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "integrator.h"
-#include "graph.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "nodes.h"
-#include "particles.h"
-#include "shader.h"
-
-#include "blender_object_cull.h"
-#include "blender_sync.h"
-#include "blender_util.h"
-
-#include "util_foreach.h"
-#include "util_hash.h"
-#include "util_logging.h"
+#include "render/camera.h"
+#include "render/integrator.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/nodes.h"
+#include "render/particles.h"
+#include "render/shader.h"
+
+#include "blender/blender_object_cull.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -343,6 +343,13 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		object_updated = true;
 	}
 
+	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
+	bool is_shadow_catcher = get_boolean(cobject, "is_shadow_catcher");
+	if(is_shadow_catcher != object->is_shadow_catcher) {
+		object->is_shadow_catcher = is_shadow_catcher;
+		object_updated = true;
+	}
+
 	/* object sync
 	 * transform comparison should not be needed, but duplis don't work perfect
 	 * in the depsgraph and may not signal changes, so this is a workaround */
diff --git a/intern/cycles/blender/blender_object_cull.cpp b/intern/cycles/blender/blender_object_cull.cpp
index 08918dd1a49..0333c027f70 100644
--- a/intern/cycles/blender/blender_object_cull.cpp
+++ b/intern/cycles/blender/blender_object_cull.cpp
@@ -16,9 +16,9 @@
 
 #include <cstdlib>
 
-#include "camera.h"
+#include "render/camera.h"
 
-#include "blender_object_cull.h"
+#include "blender/blender_object_cull.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_object_cull.h b/intern/cycles/blender/blender_object_cull.h
index b6f0ca5cd31..2147877a860 100644
--- a/intern/cycles/blender/blender_object_cull.h
+++ b/intern/cycles/blender/blender_object_cull.h
@@ -17,8 +17,8 @@
 #ifndef __BLENDER_OBJECT_CULL_H__
 #define __BLENDER_OBJECT_CULL_H__
 
-#include "blender_sync.h"
-#include "util_types.h"
+#include "blender/blender_sync.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp
index dd2900a8d5b..00f8cb3cf1b 100644
--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#include "mesh.h"
-#include "object.h"
-#include "particles.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/particles.h"
 
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
-#include "util_foreach.h"
+#include "util/util_foreach.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 438abc49f88..d509e9de981 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -16,21 +16,21 @@
 
 #include <Python.h>
 
-#include "CCL_api.h"
+#include "blender/CCL_api.h"
 
-#include "blender_sync.h"
-#include "blender_session.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_types.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
 
 #ifdef WITH_OSL
-#include "osl.h"
+#include "render/osl.h"
 
 #include <OSL/oslquery.h>
 #include <OSL/oslconfig.h>
@@ -67,8 +67,10 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
 	flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
 	flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
+	flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
 	/* Synchronize CUDA flags. */
 	flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
+	flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
 	/* Synchronize OpenCL kernel type. */
 	switch(get_enum(cscene, "debug_opencl_kernel_type")) {
 		case 0:
@@ -104,6 +106,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	}
 	/* Synchronize other OpenCL flags. */
 	flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
+	flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program");
 	return flags.opencl.device_type != opencl_device_type ||
 	       flags.opencl.kernel_type != opencl_kernel_type;
 }
@@ -641,7 +644,7 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/
 	Py_RETURN_NONE;
 }
 
-static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
+static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args)
 {
 	int num_resumable_chunks, current_resumable_chunk;
 	if(!PyArg_ParseTuple(args, "ii",
@@ -676,6 +679,53 @@ static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
 	Py_RETURN_NONE;
 }
 
+static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args)
+{
+	int num_chunks, start_chunk, end_chunk;
+	if(!PyArg_ParseTuple(args, "iii",
+	                     &num_chunks,
+	                     &start_chunk,
+	                     &end_chunk)) {
+		return NULL;  /* PyArg_ParseTuple has already set the exception. */
+	}
+	/* Validate the 1-based chunk range; any bad value aborts the process. */
+	if(num_chunks <= 0) {
+		fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(start_chunk < 1 || start_chunk > num_chunks) {
+		fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(end_chunk < 1 || end_chunk > num_chunks) {
+		fprintf(stderr, "Cycles: Bad value for end chunk number.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(start_chunk > end_chunk) {
+		fprintf(stderr, "Cycles: End chunk should be higher than start one.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	/* Log the accepted range and store it in the BlenderSession settings. */
+	VLOG(1) << "Initialized resumable render: "
+	        << "num_resumable_chunks=" << num_chunks << ", "
+	        << "start_resumable_chunk=" << start_chunk << ", "
+	        << "end_resumable_chunk=" << end_chunk;
+	BlenderSession::num_resumable_chunks = num_chunks;
+	BlenderSession::start_resumable_chunk = start_chunk;
+	BlenderSession::end_resumable_chunk = end_chunk;
+	/* Report the selected range on stdout as well. */
+	printf("Cycles: Will render chunks %d to %d of %d\n",
+	       start_chunk,
+	       end_chunk,
+	       num_chunks);
+
+	Py_RETURN_NONE;
+}
+
 static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
 {
 	vector<DeviceInfo>& devices = Device::available_devices();
@@ -715,7 +765,8 @@ static PyMethodDef methods[] = {
 	{"debug_flags_reset", debug_flags_reset_func, METH_NOARGS, ""},
 
 	/* Resumable render */
-	{"set_resumable_chunks", set_resumable_chunks_func, METH_VARARGS, ""},
+	{"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""},
+	{"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""},
 
 	/* Compute Device selection */
 	{"get_device_types", get_device_types_func, METH_VARARGS, ""},
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 2f30cbd961f..26f9bccd95d 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -16,36 +16,38 @@
 
 #include <stdlib.h>
 
-#include "background.h"
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "session.h"
-#include "shader.h"
-
-#include "util_color.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_hash.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_time.h"
-
-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
+#include "render/background.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/shader.h"
+
+#include "util/util_color.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_time.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
 
 CCL_NAMESPACE_BEGIN
 
 bool BlenderSession::headless = false;
 int BlenderSession::num_resumable_chunks = 0;
 int BlenderSession::current_resumable_chunk = 0;
+int BlenderSession::start_resumable_chunk = 0;
+int BlenderSession::end_resumable_chunk = 0;
 
 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
                                BL::UserPreferences& b_userpref,
@@ -68,6 +70,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = true;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
+	last_status_time = 0.0;
 }
 
 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
@@ -93,6 +96,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = false;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
+	last_status_time = 0.0;
 }
 
 BlenderSession::~BlenderSession()
@@ -989,10 +993,14 @@ void BlenderSession::update_status_progress()
 	if(substatus.size() > 0)
 		status += " | " + substatus;
 
-	if(status != last_status) {
+	double current_time = time_dt();
+	/* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date.
+	 * For headless rendering, only report when something significant changes to keep the console output readable. */
+	if(status != last_status || (!headless && (current_time - last_status_time) > 1.0)) {
 		b_engine.update_stats("", (timestatus + scene + status).c_str());
 		b_engine.update_memory_stats(mem_used, mem_peak);
 		last_status = status;
+		last_status_time = current_time;
 	}
 	if(progress != last_progress) {
 		b_engine.update_progress(progress);
@@ -1342,9 +1350,21 @@ void BlenderSession::update_resumable_tile_manager(int num_samples)
 		return;
 	}
 
-	int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
-	int range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
-	int range_num_samples = num_samples_per_chunk;
+	const int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
+
+	int range_start_sample, range_num_samples;
+	if(current_resumable_chunk != 0) {
+		/* Single chunk rendering (chunk numbers are 1-based). */
+		range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
+		range_num_samples = num_samples_per_chunk;
+	}
+	else {
+		/* Ranged-chunks: start and end chunk are both included. */
+		const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1;
+		range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1);
+		range_num_samples = num_chunks * num_samples_per_chunk;
+	}
+	/* Make sure we don't overshoot: the range has to end at the last sample. */
 	if(range_start_sample + range_num_samples > num_samples) {
-		range_num_samples = num_samples - range_num_samples;
+		range_num_samples = num_samples - range_start_sample;
 	}
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 82fe218b4ce..22b21a18f2e 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -17,12 +17,12 @@
 #ifndef __BLENDER_SESSION_H__
 #define __BLENDER_SESSION_H__
 
-#include "device.h"
-#include "scene.h"
-#include "session.h"
-#include "bake.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/bake.h"
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -113,6 +113,7 @@ public:
 	string last_status;
 	string last_error;
 	float last_progress;
+	double last_status_time;
 
 	int width, height;
 	double start_resize_time;
@@ -137,6 +138,10 @@ public:
 	/* Current resumable chunk index to render. */
 	static int current_resumable_chunk;
 
+	/* Alternative to single-chunk rendering to render a range of chunks. */
+	static int start_resumable_chunk;
+	static int end_resumable_chunk;
+
 protected:
 	void do_write_update_render_result(BL::RenderResult& b_rr,
 	                                   BL::RenderLayer& b_rlay,
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index c9d4236a7f2..1ec82445b20 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -14,20 +14,21 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "graph.h"
-#include "light.h"
-#include "nodes.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
-
-#include "blender_texture.h"
-#include "blender_sync.h"
-#include "blender_util.h"
-
-#include "util_debug.h"
-#include "util_string.h"
+#include "render/background.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/nodes.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"
+
+#include "blender/blender_texture.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_debug.h"
+#include "util/util_string.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -622,7 +623,8 @@ static ShaderNode *add_node(Scene *scene,
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
 			                  b_image.source() == BL::Image::source_MOVIE ||
-			                  b_engine.is_preview();
+			                  (b_engine.is_preview() &&
+			                   b_image.source() != BL::Image::source_SEQUENCE);
 
 			if(is_builtin) {
 				/* for builtin images we're using image datablock name to find an image to
@@ -675,7 +677,8 @@ static ShaderNode *add_node(Scene *scene,
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
 			                  b_image.source() == BL::Image::source_MOVIE ||
-			                  b_engine.is_preview();
+			                  (b_engine.is_preview() &&
+			                   b_image.source() != BL::Image::source_SEQUENCE);
 
 			if(is_builtin) {
 				int scene_frame = b_scene.frame_current();
@@ -1168,6 +1171,13 @@ static void add_nodes(Scene *scene,
 
 /* Sync Materials */
 
+void BlenderSync::sync_materials_simpligy(Shader *shader)
+{
+	/* Simplify the graph first, then tag the shader for update. */
+	shader->graph->simplify(scene);
+	shader->tag_update(scene);
+}
+
 void BlenderSync::sync_materials(bool update_all)
 {
 	shader_map.set_default(scene->default_surface);
@@ -1175,6 +1185,8 @@ void BlenderSync::sync_materials(bool update_all)
 	/* material loop */
 	BL::BlendData::materials_iterator b_mat;
 
+	TaskPool pool;
+
 	for(b_data.materials.begin(b_mat); b_mat != b_data.materials.end(); ++b_mat) {
 		Shader *shader;
 
@@ -1210,9 +1222,31 @@ void BlenderSync::sync_materials(bool update_all)
 			shader->displacement_method = (experimental) ? get_displacement_method(cmat) : DISPLACE_BUMP;
 
 			shader->set_graph(graph);
-			shader->tag_update(scene);
+
+			/* By simplifying the shader graph as soon as possible, some
+			 * redundant shader nodes might be removed which prevents loading
+			 * unnecessary attributes later.
+			 *
+			 * However, since graph simplification also accounts for e.g. mix
+			 * weight, this would cause frequent expensive resyncs in interactive
+			 * sessions, so for those sessions optimization is only performed
+			 * right before compiling.
+			 */
+			if(!preview) {
+				pool.push(function_bind(&BlenderSync::sync_materials_simpligy,
+				                        this,
+				                        shader));
+			}
+			else {
+				/* NOTE: Update tagging can access links which are being
+				 * optimized out.
+				 */
+				shader->tag_update(scene);
+			}
 		}
 	}
+
+	pool.wait_work();
 }
 
 /* Sync World */
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index d8043105cd8..3b071bf0e7d 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -14,29 +14,29 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "camera.h"
-#include "film.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
-#include "curves.h"
-
-#include "device.h"
-
-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_opengl.h"
-#include "util_hash.h"
+#include "render/background.h"
+#include "render/camera.h"
+#include "render/film.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/curves.h"
+
+#include "device/device.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_opengl.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 6984cbda259..62e2f8f563a 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -22,15 +22,15 @@
 #include "RNA_access.h"
 #include "RNA_blender_cpp.h"
 
-#include "blender_util.h"
+#include "blender/blender_util.h"
 
-#include "scene.h"
-#include "session.h"
+#include "render/scene.h"
+#include "render/session.h"
 
-#include "util_map.h"
-#include "util_set.h"
-#include "util_transform.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -96,6 +96,7 @@ public:
 private:
 	/* sync */
 	void sync_lamps(bool update_all);
+	void sync_materials_simpligy(Shader *shader);
 	void sync_materials(bool update_all);
 	void sync_objects(BL::SpaceView3D& b_v3d, float motion_time = 0.0f);
 	void sync_motion(BL::RenderSettings& b_render,
diff --git a/intern/cycles/blender/blender_texture.cpp b/intern/cycles/blender/blender_texture.cpp
index 3807e683c7c..b2e27b76189 100644
--- a/intern/cycles/blender/blender_texture.cpp
+++ b/intern/cycles/blender/blender_texture.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "blender_texture.h"
+#include "blender/blender_texture.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_texture.h b/intern/cycles/blender/blender_texture.h
index ad96f9db8ed..734231a85ec 100644
--- a/intern/cycles/blender/blender_texture.h
+++ b/intern/cycles/blender/blender_texture.h
@@ -18,7 +18,7 @@
 #define __BLENDER_TEXTURE_H__
 
 #include <stdlib.h>
-#include "blender_sync.h"
+#include "blender/blender_sync.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index b67834cdea3..4d575330520 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -17,14 +17,15 @@
 #ifndef __BLENDER_UTIL_H__
 #define __BLENDER_UTIL_H__
 
-#include "mesh.h"
+#include "render/mesh.h"
 
-#include "util_map.h"
-#include "util_path.h"
-#include "util_set.h"
-#include "util_transform.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_algorithm.h"
+#include "util/util_map.h"
+#include "util/util_path.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 /* Hacks to hook into Blender API
  * todo: clean this up ... */
@@ -78,7 +79,7 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
 				me.calc_normals_split();
 			}
 			else {
-				me.split_faces();
+				me.split_faces(false);
 			}
 		}
 		if(subdivision_type == Mesh::SUBDIVISION_NONE) {
@@ -786,6 +787,35 @@ struct ParticleSystemKey {
 	}
 };
 
+/* Set of undirected edges, each kept as a (lower, higher) vertex index pair. */
+class EdgeMap {
+public:
+	EdgeMap() {
+	}
+	/* Forget every edge recorded so far. */
+	void clear() {
+		edges_.clear();
+	}
+	/* Record the edge between v0 and v1; the argument order does not matter. */
+	void insert(int v0, int v1) {
+		get_sorted_verts(v0, v1);
+		edges_.insert(std::make_pair(v0, v1));
+	}
+	/* Tell whether the edge between v0 and v1 (in either order) was inserted. */
+	bool exists(int v0, int v1) {
+		get_sorted_verts(v0, v1);
+		return edges_.count(std::make_pair(v0, v1)) != 0;
+	}
+protected:
+	/* Reorder the two indices in place so that v0 <= v1. */
+	void get_sorted_verts(int& v0, int& v1) {
+		if(v1 < v0) {
+			swap(v0, v1);
+		}
+	}
+	set< std::pair<int, int> > edges_;
+};
+
 CCL_NAMESPACE_END
 
 #endif /* __BLENDER_UTIL_H__ */
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index 92e48f0d87f..4701d75350a 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../util
-	../device
+	..
 )
 
 set(INC_SYS
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 874a4246d1d..58348d16746 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -15,25 +15,25 @@
  * limitations under the License.
  */
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "curves.h"
-
-#include "bvh.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
-#include "bvh_params.h"
-#include "bvh_unaligned.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_math.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/curves.h"
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_params.h"
+#include "bvh/bvh_unaligned.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_math.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -67,7 +67,7 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
 	if(params.use_qbvh)
 		return new QBVH(params, objects);
 	else
-		return new RegularBVH(params, objects);
+		return new BinaryBVH(params, objects);
 }
 
 /* Building */
@@ -81,6 +81,7 @@ void BVH::build(Progress& progress)
 	                   pack.prim_type,
 	                   pack.prim_index,
 	                   pack.prim_object,
+	                   pack.prim_time,
 	                   params,
 	                   progress);
 	BVHNode *root = bvh_build.run();
@@ -256,6 +257,10 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	pack.leaf_nodes.resize(leaf_nodes_size);
 	pack.object_node.resize(objects.size());
 
+	if(params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0) {
+		pack.prim_time.resize(prim_index_size);
+	}
+
 	int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL;
 	int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL;
 	int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL;
@@ -264,6 +269,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL;
 	int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL;
 	int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL;
+	float2 *pack_prim_time = (pack.prim_time.size())? &pack.prim_time[0]: NULL;
 
 	/* merge */
 	foreach(Object *ob, objects) {
@@ -309,6 +315,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 			int *bvh_prim_type = &bvh->pack.prim_type[0];
 			uint *bvh_prim_visibility = &bvh->pack.prim_visibility[0];
 			uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0];
+			float2 *bvh_prim_time = bvh->pack.prim_time.size()? &bvh->pack.prim_time[0]: NULL;
 
 			for(size_t i = 0; i < bvh_prim_index_size; i++) {
 				if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
@@ -324,6 +331,9 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 				pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i];
 				pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i];
 				pack_prim_object[pack_prim_index_offset] = 0;  // unused for instances
+				if(bvh_prim_time != NULL) {
+					pack_prim_time[pack_prim_index_offset] = bvh_prim_time[i];
+				}
 				pack_prim_index_offset++;
 			}
 		}
@@ -414,64 +424,64 @@ static bool node_bvh_is_unaligned(const BVHNode *node)
 {
 	const BVHNode *node0 = node->get_child(0),
 	              *node1 = node->get_child(1);
-	return node0->is_unaligned() || node1->is_unaligned();
+	return node0->is_unaligned || node1->is_unaligned;
 }
 
-RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_)
+BinaryBVH::BinaryBVH(const BVHParams& params_, const vector<Object*>& objects_)
 : BVH(params_, objects_)
 {
 }
 
-void RegularBVH::pack_leaf(const BVHStackEntry& e,
-                           const LeafNode *leaf)
+void BinaryBVH::pack_leaf(const BVHStackEntry& e,
+                          const LeafNode *leaf)
 {
 	assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
 	float4 data[BVH_NODE_LEAF_SIZE];
 	memset(data, 0, sizeof(data));
-	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
+	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
 		/* object */
-		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].x = __int_as_float(~(leaf->lo));
 		data[0].y = __int_as_float(0);
 	}
 	else {
 		/* triangle */
-		data[0].x = __int_as_float(leaf->m_lo);
-		data[0].y = __int_as_float(leaf->m_hi);
+		data[0].x = __int_as_float(leaf->lo);
+		data[0].y = __int_as_float(leaf->hi);
 	}
-	data[0].z = __uint_as_float(leaf->m_visibility);
+	data[0].z = __uint_as_float(leaf->visibility);
 	if(leaf->num_triangles() != 0) {
-		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
+		data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
 	}
 
 	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
 }
 
-void RegularBVH::pack_inner(const BVHStackEntry& e,
-                            const BVHStackEntry& e0,
-                            const BVHStackEntry& e1)
+void BinaryBVH::pack_inner(const BVHStackEntry& e,
+                           const BVHStackEntry& e0,
+                           const BVHStackEntry& e1)
 {
-	if(e0.node->is_unaligned() || e1.node->is_unaligned()) {
+	if(e0.node->is_unaligned || e1.node->is_unaligned) {
 		pack_unaligned_inner(e, e0, e1);
 	} else {
 		pack_aligned_inner(e, e0, e1);
 	}
 }
 
-void RegularBVH::pack_aligned_inner(const BVHStackEntry& e,
-                                    const BVHStackEntry& e0,
-                                    const BVHStackEntry& e1)
+void BinaryBVH::pack_aligned_inner(const BVHStackEntry& e,
+                                   const BVHStackEntry& e0,
+                                   const BVHStackEntry& e1)
 {
 	pack_aligned_node(e.idx,
-	                  e0.node->m_bounds, e1.node->m_bounds,
+	                  e0.node->bounds, e1.node->bounds,
 	                  e0.encodeIdx(), e1.encodeIdx(),
-	                  e0.node->m_visibility, e1.node->m_visibility);
+	                  e0.node->visibility, e1.node->visibility);
 }
 
-void RegularBVH::pack_aligned_node(int idx,
-                                   const BoundBox& b0,
-                                   const BoundBox& b1,
-                                   int c0, int c1,
-                                   uint visibility0, uint visibility1)
+void BinaryBVH::pack_aligned_node(int idx,
+                                  const BoundBox& b0,
+                                  const BoundBox& b1,
+                                  int c0, int c1,
+                                  uint visibility0, uint visibility1)
 {
 	assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
 	assert(c0 < 0 || c0 < pack.nodes.size());
@@ -498,26 +508,26 @@ void RegularBVH::pack_aligned_node(int idx,
 	memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE);
 }
 
-void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e,
-                                      const BVHStackEntry& e0,
-                                      const BVHStackEntry& e1)
+void BinaryBVH::pack_unaligned_inner(const BVHStackEntry& e,
+                                     const BVHStackEntry& e0,
+                                     const BVHStackEntry& e1)
 {
 	pack_unaligned_node(e.idx,
 	                    e0.node->get_aligned_space(),
 	                    e1.node->get_aligned_space(),
-	                    e0.node->m_bounds,
-	                    e1.node->m_bounds,
+	                    e0.node->bounds,
+	                    e1.node->bounds,
 	                    e0.encodeIdx(), e1.encodeIdx(),
-	                    e0.node->m_visibility, e1.node->m_visibility);
+	                    e0.node->visibility, e1.node->visibility);
 }
 
-void RegularBVH::pack_unaligned_node(int idx,
-                                     const Transform& aligned_space0,
-                                     const Transform& aligned_space1,
-                                     const BoundBox& bounds0,
-                                     const BoundBox& bounds1,
-                                     int c0, int c1,
-                                     uint visibility0, uint visibility1)
+void BinaryBVH::pack_unaligned_node(int idx,
+                                    const Transform& aligned_space0,
+                                    const Transform& aligned_space1,
+                                    const BoundBox& bounds0,
+                                    const BoundBox& bounds1,
+                                    int c0, int c1,
+                                    uint visibility0, uint visibility1)
 {
 	assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size());
 	assert(c0 < 0 || c0 < pack.nodes.size());
@@ -543,7 +553,7 @@ void RegularBVH::pack_unaligned_node(int idx,
 	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE);
 }
 
-void RegularBVH::pack_nodes(const BVHNode *root)
+void BinaryBVH::pack_nodes(const BVHNode *root)
 {
 	const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
 	const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
@@ -620,7 +630,7 @@ void RegularBVH::pack_nodes(const BVHNode *root)
 	pack.root_index = (root->is_leaf())? -1: 0;
 }
 
-void RegularBVH::refit_nodes()
+void BinaryBVH::refit_nodes()
 {
 	assert(!params.top_level);
 
@@ -629,7 +639,7 @@ void RegularBVH::refit_nodes()
 	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
 }
 
-void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+void BinaryBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 {
 	if(leaf) {
 		assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
@@ -759,18 +769,18 @@ static bool node_qbvh_is_unaligned(const BVHNode *node)
 	              *node1 = node->get_child(1);
 	bool has_unaligned = false;
 	if(node0->is_leaf()) {
-		has_unaligned |= node0->is_unaligned();
+		has_unaligned |= node0->is_unaligned;
 	}
 	else {
-		has_unaligned |= node0->get_child(0)->is_unaligned();
-		has_unaligned |= node0->get_child(1)->is_unaligned();
+		has_unaligned |= node0->get_child(0)->is_unaligned;
+		has_unaligned |= node0->get_child(1)->is_unaligned;
 	}
 	if(node1->is_leaf()) {
-		has_unaligned |= node1->is_unaligned();
+		has_unaligned |= node1->is_unaligned;
 	}
 	else {
-		has_unaligned |= node1->get_child(0)->is_unaligned();
-		has_unaligned |= node1->get_child(1)->is_unaligned();
+		has_unaligned |= node1->get_child(0)->is_unaligned;
+		has_unaligned |= node1->get_child(1)->is_unaligned;
 	}
 	return has_unaligned;
 }
@@ -785,19 +795,19 @@ void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
 {
 	float4 data[BVH_QNODE_LEAF_SIZE];
 	memset(data, 0, sizeof(data));
-	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
+	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
 		/* object */
-		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].x = __int_as_float(~(leaf->lo));
 		data[0].y = __int_as_float(0);
 	}
 	else {
 		/* triangle */
-		data[0].x = __int_as_float(leaf->m_lo);
-		data[0].y = __int_as_float(leaf->m_hi);
+		data[0].x = __int_as_float(leaf->lo);
+		data[0].y = __int_as_float(leaf->hi);
 	}
-	data[0].z = __uint_as_float(leaf->m_visibility);
+	data[0].z = __uint_as_float(leaf->visibility);
 	if(leaf->num_triangles() != 0) {
-		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
+		data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
 	}
 
 	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
@@ -813,7 +823,7 @@ void QBVH::pack_inner(const BVHStackEntry& e,
 	 */
 	if(params.use_unaligned_nodes) {
 		for(int i = 0; i < num; i++) {
-			if(en[i].node->is_unaligned()) {
+			if(en[i].node->is_unaligned) {
 				has_unaligned = true;
 				break;
 			}
@@ -838,15 +848,15 @@ void QBVH::pack_aligned_inner(const BVHStackEntry& e,
 	BoundBox bounds[4];
 	int child[4];
 	for(int i = 0; i < num; ++i) {
-		bounds[i] = en[i].node->m_bounds;
+		bounds[i] = en[i].node->bounds;
 		child[i] = en[i].encodeIdx();
 	}
 	pack_aligned_node(e.idx,
 	                  bounds,
 	                  child,
-	                  e.node->m_visibility,
-	                  e.node->m_time_from,
-	                  e.node->m_time_to,
+	                  e.node->visibility,
+	                  e.node->time_from,
+	                  e.node->time_to,
 	                  num);
 }
 
@@ -907,16 +917,16 @@ void QBVH::pack_unaligned_inner(const BVHStackEntry& e,
 	int child[4];
 	for(int i = 0; i < num; ++i) {
 		aligned_space[i] = en[i].node->get_aligned_space();
-		bounds[i] = en[i].node->m_bounds;
+		bounds[i] = en[i].node->bounds;
 		child[i] = en[i].encodeIdx();
 	}
 	pack_unaligned_node(e.idx,
 	                    aligned_space,
 	                    bounds,
 	                    child,
-	                    e.node->m_visibility,
-	                    e.node->m_time_from,
-	                    e.node->m_time_to,
+	                    e.node->visibility,
+	                    e.node->time_from,
+	                    e.node->time_to,
 	                    num);
 }
 
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 35f4d305883..60bc62ee6e4 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -18,10 +18,10 @@
 #ifndef __BVH_H__
 #define __BVH_H__
 
-#include "bvh_params.h"
+#include "bvh/bvh_params.h"
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -68,6 +68,8 @@ struct PackedBVH {
 	array<int> prim_index;
 	/* mapping from BVH primitive index, to the object id of that primitive. */
 	array<int> prim_object;
+	/* Time range of BVH primitive. */
+	array<float2> prim_time;
 
 	/* index of the root node. */
 	int root_index;
@@ -108,15 +110,15 @@ protected:
 	virtual void refit_nodes() = 0;
 };
 
-/* Regular BVH
+/* Binary BVH
  *
  * Typical BVH with each node having two children. */
 
-class RegularBVH : public BVH {
+class BinaryBVH : public BVH {
 protected:
 	/* constructor */
 	friend class BVH;
-	RegularBVH(const BVHParams& params, const vector<Object*>& objects);
+	BinaryBVH(const BVHParams& params, const vector<Object*>& objects);
 
 	/* pack */
 	void pack_nodes(const BVHNode *root);
diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp
index 5ddd7349f7b..3226008f511 100644
--- a/intern/cycles/bvh/bvh_binning.cpp
+++ b/intern/cycles/bvh/bvh_binning.cpp
@@ -19,11 +19,11 @@
 
 #include <stdlib.h>
 
-#include "bvh_binning.h"
+#include "bvh/bvh_binning.h"
 
-#include "util_algorithm.h"
-#include "util_boundbox.h"
-#include "util_types.h"
+#include "util/util_algorithm.h"
+#include "util/util_boundbox.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h
index 52955f70151..285f9c56a62 100644
--- a/intern/cycles/bvh/bvh_binning.h
+++ b/intern/cycles/bvh/bvh_binning.h
@@ -18,10 +18,10 @@
 #ifndef __BVH_BINNING_H__
 #define __BVH_BINNING_H__
 
-#include "bvh_params.h"
-#include "bvh_unaligned.h"
+#include "bvh/bvh_params.h"
+#include "bvh/bvh_unaligned.h"
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index a2f8b33cb0b..95c71b54da0 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -15,26 +15,26 @@
  * limitations under the License.
  */
 
-#include "bvh_binning.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
-#include "bvh_params.h"
+#include "bvh/bvh_binning.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_params.h"
 #include "bvh_split.h"
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "curves.h"
-
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_stack_allocator.h"
-#include "util_simd.h"
-#include "util_time.h"
-#include "util_queue.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/curves.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_stack_allocator.h"
+#include "util/util_simd.h"
+#include "util/util_time.h"
+#include "util/util_queue.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -93,12 +93,14 @@ BVHBuild::BVHBuild(const vector<Object*>& objects_,
                    array<int>& prim_type_,
                    array<int>& prim_index_,
                    array<int>& prim_object_,
+                   array<float2>& prim_time_,
                    const BVHParams& params_,
                    Progress& progress_)
  : objects(objects_),
    prim_type(prim_type_),
    prim_index(prim_index_),
    prim_object(prim_object_),
+   prim_time(prim_time_),
    params(params_),
    progress(progress_),
    progress_start_time(0.0),
@@ -465,6 +467,9 @@ BVHNode* BVHBuild::run()
 	}
 	spatial_free_index = 0;
 
+	need_prim_time = params.num_motion_curve_steps > 0 ||
+	                 params.num_motion_triangle_steps > 0;
+
 	/* init progress updates */
 	double build_start_time;
 	build_start_time = progress_start_time = time_dt();
@@ -475,6 +480,12 @@ BVHNode* BVHBuild::run()
 	prim_type.resize(references.size());
 	prim_index.resize(references.size());
 	prim_object.resize(references.size());
+	if(need_prim_time) {
+		prim_time.resize(references.size());
+	}
+	else {
+		prim_time.resize(0);
+	}
 
 	/* build recursively */
 	BVHNode *rootnode;
@@ -849,11 +860,14 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		prim_type[start] = ref->prim_type();
 		prim_index[start] = ref->prim_index();
 		prim_object[start] = ref->prim_object();
+		if(need_prim_time) {
+			prim_time[start] = make_float2(ref->time_from(), ref->time_to());
+		}
 
 		uint visibility = objects[ref->prim_object()]->visibility;
 		BVHNode *leaf_node =  new LeafNode(ref->bounds(), visibility, start, start+1);
-		leaf_node->m_time_from = ref->time_from();
-		leaf_node->m_time_to = ref->time_to();
+		leaf_node->time_from = ref->time_from();
+		leaf_node->time_to = ref->time_to();
 		return leaf_node;
 	}
 	else {
@@ -862,12 +876,12 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid);
 
 		BoundBox bounds = BoundBox::empty;
-		bounds.grow(leaf0->m_bounds);
-		bounds.grow(leaf1->m_bounds);
+		bounds.grow(leaf0->bounds);
+		bounds.grow(leaf1->bounds);
 
 		BVHNode *inner_node = new InnerNode(bounds, leaf0, leaf1);
-		inner_node->m_time_from = min(leaf0->m_time_from, leaf1->m_time_from);
-		inner_node->m_time_to = max(leaf0->m_time_to, leaf1->m_time_to);
+		inner_node->time_from = min(leaf0->time_from, leaf1->time_from);
+		inner_node->time_to = max(leaf0->time_to, leaf1->time_to);
 		return inner_node;
 	}
 }
@@ -891,11 +905,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	 *    can not control.
 	 */
 	typedef StackAllocator<256, int> LeafStackAllocator;
+	typedef StackAllocator<256, float2> LeafTimeStackAllocator;
 	typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;
 
 	vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
+	vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
 	vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL];
 
 	/* TODO(sergey): In theory we should be able to store references. */
@@ -918,6 +934,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			p_type[type_index].push_back(ref.prim_type());
 			p_index[type_index].push_back(ref.prim_index());
 			p_object[type_index].push_back(ref.prim_object());
+			p_time[type_index].push_back(make_float2(ref.time_from(),
+			                                         ref.time_to()));
 
 			bounds[type_index].grow(ref.bounds());
 			visibility[type_index] |= objects[ref.prim_object()]->visibility;
@@ -947,9 +965,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	vector<int, LeafStackAllocator> local_prim_type,
 	                                local_prim_index,
 	                                local_prim_object;
+	vector<float2, LeafTimeStackAllocator> local_prim_time;
 	local_prim_type.resize(num_new_prims);
 	local_prim_index.resize(num_new_prims);
 	local_prim_object.resize(num_new_prims);
+	if(need_prim_time) {
+		local_prim_time.resize(num_new_prims);
+	}
 	for(int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
 		int num = (int)p_type[i].size();
 		if(num != 0) {
@@ -962,6 +984,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				local_prim_type[index] = p_type[i][j];
 				local_prim_index[index] = p_index[i][j];
 				local_prim_object[index] = p_object[i][j];
+				if(need_prim_time) {
+					local_prim_time[index] = p_time[i][j];
+				}
 				if(params.use_unaligned_nodes && !alignment_found) {
 					alignment_found =
 						unaligned_heuristic.compute_aligned_space(p_ref[i][j],
@@ -979,19 +1004,19 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 					time_from = min(time_from, ref.time_from());
 					time_to = max(time_to, ref.time_to());
 				}
-				leaf_node->m_time_from = time_from;
-				leaf_node->m_time_to = time_to;
+				leaf_node->time_from = time_from;
+				leaf_node->time_to = time_to;
 			}
 			if(alignment_found) {
 				/* Need to recalculate leaf bounds with new alignment. */
-				leaf_node->m_bounds = BoundBox::empty;
+				leaf_node->bounds = BoundBox::empty;
 				for(int j = 0; j < num; ++j) {
 					const BVHReference &ref = p_ref[i][j];
 					BoundBox ref_bounds =
 					        unaligned_heuristic.compute_aligned_prim_boundbox(
 					                ref,
 					                aligned_space);
-					leaf_node->m_bounds.grow(ref_bounds);
+					leaf_node->bounds.grow(ref_bounds);
 				}
 				/* Set alignment space. */
 				leaf_node->set_aligned_space(aligned_space);
@@ -1028,11 +1053,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				prim_type.reserve(reserve);
 				prim_index.reserve(reserve);
 				prim_object.reserve(reserve);
+				if(need_prim_time) {
+					prim_time.reserve(reserve);
+				}
 			}
 
 			prim_type.resize(range_end);
 			prim_index.resize(range_end);
 			prim_object.resize(range_end);
+			if(need_prim_time) {
+				prim_time.resize(range_end);
+			}
 		}
 		spatial_spin_lock.unlock();
 
@@ -1041,6 +1072,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
 			memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
 			memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
+			if(need_prim_time) {
+				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
+			}
 		}
 	}
 	else {
@@ -1053,6 +1087,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
 			memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
 			memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
+			if(need_prim_time) {
+				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
+			}
 		}
 	}
 
@@ -1062,8 +1099,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	 */
 	for(int i = 0; i < num_leaves; ++i) {
 		LeafNode *leaf = (LeafNode *)leaves[i];
-		leaf->m_lo += start_index;
-		leaf->m_hi += start_index;
+		leaf->lo += start_index;
+		leaf->hi += start_index;
 	}
 
 	/* Create leaf node for object. */
@@ -1092,17 +1129,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 		return new InnerNode(range.bounds(), leaves[0], leaves[1]);
 	}
 	else if(num_leaves == 3) {
-		BoundBox inner_bounds = merge(leaves[1]->m_bounds, leaves[2]->m_bounds);
+		BoundBox inner_bounds = merge(leaves[1]->bounds, leaves[2]->bounds);
 		BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]);
 		return new InnerNode(range.bounds(), leaves[0], inner);
 	} else {
 		/* Should be doing more branches if more primitive types added. */
 		assert(num_leaves <= 5);
-		BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds);
-		BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds);
+		BoundBox inner_bounds_a = merge(leaves[0]->bounds, leaves[1]->bounds);
+		BoundBox inner_bounds_b = merge(leaves[2]->bounds, leaves[3]->bounds);
 		BVHNode *inner_a = new InnerNode(inner_bounds_a, leaves[0], leaves[1]);
 		BVHNode *inner_b = new InnerNode(inner_bounds_b, leaves[2], leaves[3]);
-		BoundBox inner_bounds_c = merge(inner_a->m_bounds, inner_b->m_bounds);
+		BoundBox inner_bounds_c = merge(inner_a->bounds, inner_b->bounds);
 		BVHNode *inner_c = new InnerNode(inner_bounds_c, inner_a, inner_b);
 		if(num_leaves == 5) {
 			return new InnerNode(range.bounds(), inner_c, leaves[4]);
@@ -1137,8 +1174,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 		rotate(parent->children[c], max_depth-1);
 
 	/* compute current area of all children */
-	BoundBox bounds0 = parent->children[0]->m_bounds;
-	BoundBox bounds1 = parent->children[1]->m_bounds;
+	BoundBox bounds0 = parent->children[0]->bounds;
+	BoundBox bounds1 = parent->children[1]->bounds;
 
 	float area0 = bounds0.half_area();
 	float area1 = bounds1.half_area();
@@ -1158,8 +1195,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 		BoundBox& other = (c == 0)? bounds1: bounds0;
 
 		/* transpose child bounds */
-		BoundBox target0 = child->children[0]->m_bounds;
-		BoundBox target1 = child->children[1]->m_bounds;
+		BoundBox target0 = child->children[0]->bounds;
+		BoundBox target1 = child->children[1]->bounds;
 
 		/* compute cost for both possible swaps */
 		float cost0 = merge(other, target1).half_area() - child_area[c];
@@ -1191,7 +1228,7 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 	InnerNode *child = (InnerNode*)parent->children[best_child];
 
 	swap(parent->children[best_other], child->children[best_target]);
-	child->m_bounds = merge(child->children[0]->m_bounds, child->children[1]->m_bounds);
+	child->bounds = merge(child->children[0]->bounds, child->children[1]->bounds);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index ee3cde66a2f..5733708050d 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -20,13 +20,13 @@
 
 #include <float.h>
 
-#include "bvh.h"
-#include "bvh_binning.h"
-#include "bvh_unaligned.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_binning.h"
+#include "bvh/bvh_unaligned.h"
 
-#include "util_boundbox.h"
-#include "util_task.h"
-#include "util_vector.h"
+#include "util/util_boundbox.h"
+#include "util/util_task.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,6 +48,7 @@ public:
 	         array<int>& prim_type,
 	         array<int>& prim_index,
 	         array<int>& prim_object,
+	         array<float2>& prim_time,
 	         const BVHParams& params,
 	         Progress& progress);
 	~BVHBuild();
@@ -112,6 +113,9 @@ protected:
 	array<int>& prim_type;
 	array<int>& prim_index;
 	array<int>& prim_object;
+	array<float2>& prim_time;
+
+	bool need_prim_time;
 
 	/* Build parameters. */
 	BVHParams params;
diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp
index 67580e1bc7b..4f788c66797 100644
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -15,12 +15,12 @@
  * limitations under the License.
  */
 
-#include "bvh.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"
 
-#include "util_debug.h"
-#include "util_vector.h"
+#include "util/util_debug.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -62,12 +62,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			}
 			return cnt;
 		case BVH_STAT_ALIGNED_COUNT:
-			if(!is_unaligned()) {
+			if(!is_unaligned) {
 				cnt = 1;
 			}
 			break;
 		case BVH_STAT_UNALIGNED_COUNT:
-			if(is_unaligned()) {
+			if(is_unaligned) {
 				cnt = 1;
 			}
 			break;
@@ -75,7 +75,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			if(!is_leaf()) {
 				bool has_unaligned = false;
 				for(int j = 0; j < num_children(); j++) {
-					has_unaligned |= get_child(j)->is_unaligned();
+					has_unaligned |= get_child(j)->is_unaligned;
 				}
 				cnt += has_unaligned? 0: 1;
 			}
@@ -84,7 +84,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			if(!is_leaf()) {
 				bool has_unaligned = false;
 				for(int j = 0; j < num_children(); j++) {
-					has_unaligned |= get_child(j)->is_unaligned();
+					has_unaligned |= get_child(j)->is_unaligned;
 				}
 				cnt += has_unaligned? 1: 0;
 			}
@@ -95,12 +95,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 				for(int i = 0; i < num_children(); i++) {
 					BVHNode *node = get_child(i);
 					if(node->is_leaf()) {
-						has_unaligned |= node->is_unaligned();
+						has_unaligned |= node->is_unaligned;
 					}
 					else {
 						for(int j = 0; j < node->num_children(); j++) {
 							cnt += node->get_child(j)->getSubtreeSize(stat);
-							has_unaligned |= node->get_child(j)->is_unaligned();
+							has_unaligned |= node->get_child(j)->is_unaligned;
 						}
 					}
 				}
@@ -113,12 +113,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 				for(int i = 0; i < num_children(); i++) {
 					BVHNode *node = get_child(i);
 					if(node->is_leaf()) {
-						has_unaligned |= node->is_unaligned();
+						has_unaligned |= node->is_unaligned;
 					}
 					else {
 						for(int j = 0; j < node->num_children(); j++) {
 							cnt += node->get_child(j)->getSubtreeSize(stat);
-							has_unaligned |= node->get_child(j)->is_unaligned();
+							has_unaligned |= node->get_child(j)->is_unaligned;
 						}
 					}
 				}
@@ -126,10 +126,10 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			}
 			return cnt;
 		case BVH_STAT_ALIGNED_LEAF_COUNT:
-			cnt = (is_leaf() && !is_unaligned()) ? 1 : 0;
+			cnt = (is_leaf() && !is_unaligned) ? 1 : 0;
 			break;
 		case BVH_STAT_UNALIGNED_LEAF_COUNT:
-			cnt = (is_leaf() && is_unaligned()) ? 1 : 0;
+			cnt = (is_leaf() && is_unaligned) ? 1 : 0;
 			break;
 		default:
 			assert(0); /* unknown mode */
@@ -157,7 +157,7 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons
 
 	for(int i = 0; i < num_children(); i++) {
 		BVHNode *child = get_child(i);
-		SAH += child->computeSubtreeSAHCost(p, probability * child->m_bounds.safe_area()/m_bounds.safe_area());
+		SAH += child->computeSubtreeSAHCost(p, probability * child->bounds.safe_area()/bounds.safe_area());
 	}
 
 	return SAH;
@@ -165,15 +165,15 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons
 
 uint BVHNode::update_visibility()
 {
-	if(!is_leaf() && m_visibility == 0) {
+	if(!is_leaf() && visibility == 0) { /* Inner node with a zero mask: recompute it from both children. */
 		InnerNode *inner = (InnerNode*)this;
 		BVHNode *child0 = inner->children[0];
 		BVHNode *child1 = inner->children[1];
 
-		m_visibility = child0->update_visibility()|child1->update_visibility();
+		visibility = child0->update_visibility()|child1->update_visibility(); /* Recurses first, then ORs the two child masks. */
 	}
 
-	return m_visibility;
+	return visibility;
 }
 
 void BVHNode::update_time()
@@ -184,8 +184,8 @@ void BVHNode::update_time()
 		BVHNode *child1 = inner->children[1];
 		child0->update_time();
 		child1->update_time();
-		m_time_from = min(child0->m_time_from, child1->m_time_from);
-		m_time_to =  max(child0->m_time_to, child1->m_time_to);
+		time_from = min(child0->time_from, child1->time_from);
+		time_to =  max(child0->time_to, child1->time_to);
 	}
 }
 
@@ -209,7 +209,7 @@ void LeafNode::print(int depth) const
 	for(int i = 0; i < depth; i++)
 		printf("  ");
 	
-	printf("leaf node %d to %d\n", m_lo, m_hi);
+	printf("leaf node %d to %d\n", lo, hi);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index 090c426de56..60511b4b012 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -18,9 +18,9 @@
 #ifndef __BVH_NODE_H__
 #define __BVH_NODE_H__
 
-#include "util_boundbox.h"
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -46,16 +46,16 @@ class BVHParams;
 class BVHNode
 {
 public:
-	BVHNode() : m_is_unaligned(false),
-	            m_aligned_space(NULL),
-	            m_time_from(0.0f),
-	            m_time_to(1.0f)
+	BVHNode() : is_unaligned(false),
+	            aligned_space(NULL),
+	            time_from(0.0f),
+	            time_to(1.0f) /* Default time range is [0, 1]; bounds and visibility are not set here. */
 	{
 	}
 
 	virtual ~BVHNode()
 	{
-		delete m_aligned_space;
+		delete aligned_space; /* Heap copy made by set_aligned_space(); still NULL if that was never called. */
 	}
 
 	virtual bool is_leaf() const = 0;
@@ -63,30 +63,26 @@ public:
 	virtual BVHNode *get_child(int i) const = 0;
 	virtual int num_triangles() const { return 0; }
 	virtual void print(int depth = 0) const = 0;
-	bool is_unaligned() const { return m_is_unaligned; }
 
 	inline void set_aligned_space(const Transform& aligned_space)
 	{
-		m_is_unaligned = true;
-		if(m_aligned_space == NULL) {
-			m_aligned_space = new Transform(aligned_space);
+		is_unaligned = true; /* Setting an aligned space always marks the node as unaligned. */
+		if(this->aligned_space == NULL) { /* this-> is required: the parameter shadows the member. */
+			this->aligned_space = new Transform(aligned_space);
 		}
 		else {
-			*m_aligned_space = aligned_space;
+			*this->aligned_space = aligned_space; /* Reuse the existing allocation. */
 		}
 	}
 
 	inline Transform get_aligned_space() const
 	{
-		if(m_aligned_space == NULL) {
+		if(aligned_space == NULL) { /* No space was ever set: return the identity transform. */
 			return transform_identity();
 		}
-		return *m_aligned_space;
+		return *aligned_space; /* Returned by value, so the caller gets a copy. */
 	}
 
-	BoundBox m_bounds;
-	uint m_visibility;
-
 	// Subtree functions
 	int getSubtreeSize(BVH_STAT stat=BVH_STAT_NODE_COUNT) const;
 	float computeSubtreeSAHCost(const BVHParams& p, float probability = 1.0f) const;
@@ -95,13 +91,18 @@ public:
 	uint update_visibility();
 	void update_time();
 
-	bool m_is_unaligned;
+	// Properties.
+	BoundBox bounds;
+	uint visibility;
+
+	bool is_unaligned;
 
-	// TODO(sergey): Can be stored as 3x3 matrix, but better to have some
-	// utilities and type defines in util_transform first.
-	Transform *m_aligned_space;
+	/* TODO(sergey): Can be stored as 3x3 matrix, but better to have some
+	 * utilities and type defines in util_transform first.
+	 */
+	Transform *aligned_space;
 
-	float m_time_from, m_time_to;
+	float time_from, time_to;
 };
 
 class InnerNode : public BVHNode
@@ -111,20 +112,20 @@ public:
 	          BVHNode* child0,
 	          BVHNode* child1)
 	{
-		m_bounds = bounds;
+		this->bounds = bounds;
 		children[0] = child0;
 		children[1] = child1;
 
 		if(child0 && child1)
-			m_visibility = child0->m_visibility|child1->m_visibility;
+			visibility = child0->visibility|child1->visibility;
 		else
-			m_visibility = 0; /* happens on build cancel */
+			visibility = 0; /* happens on build cancel */
 	}
 
 	explicit InnerNode(const BoundBox& bounds)
 	{
-		m_bounds = bounds;
-		m_visibility = 0;
+		this->bounds = bounds; /* this-> is required: the parameter shadows the member. */
+		visibility = 0; /* Children are NULL here; BVHNode::update_visibility() recomputes a zero mask. */
 		children[0] = NULL;
 		children[1] = NULL;
 	}
@@ -140,12 +141,12 @@ public:
 class LeafNode : public BVHNode
 {
 public:
-	LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi) 
+	LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi)
+	: lo(lo),
+	  hi(hi) /* num_triangles() returns hi - lo. */
 	{
-		m_bounds = bounds;
-		m_visibility = visibility;
-		m_lo = lo;
-		m_hi = hi;
+		this->bounds = bounds; /* Inherited members cannot be set in the initializer list. */
+		this->visibility = visibility;
 	}
 
 	LeafNode(const LeafNode& s)
@@ -157,14 +158,13 @@ public:
 	bool is_leaf() const { return true; }
 	int num_children() const { return 0; }
 	BVHNode *get_child(int) const { return NULL; }
-	int num_triangles() const { return m_hi - m_lo; }
+	int num_triangles() const { return hi - lo; }
 	void print(int depth) const;
 
-	int m_lo;
-	int m_hi;
+	int lo;
+	int hi;
 };
 
 CCL_NAMESPACE_END
 
 #endif /* __BVH_NODE_H__ */
-
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 65f9da1c194..9795a7a4350 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -18,9 +18,9 @@
 #ifndef __BVH_PARAMS_H__
 #define __BVH_PARAMS_H__
 
-#include "util_boundbox.h"
+#include "util/util_boundbox.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -104,6 +104,7 @@ public:
 		primitive_mask = PRIMITIVE_ALL;
 
 		num_motion_curve_steps = 0;
+		num_motion_triangle_steps = 0;
 	}
 
 	/* SAH costs */
diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp
index e5bcf9995bf..d29629c0279 100644
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -15,12 +15,12 @@
  * limitations under the License.
  */
 
-#include "bvh_build.h"
-#include "bvh_sort.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_sort.h"
 
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_task.h"
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index d0d5fbe5a7a..b10d69a495d 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -15,14 +15,14 @@
  * limitations under the License.
  */
 
-#include "bvh_build.h"
-#include "bvh_split.h"
-#include "bvh_sort.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_split.h"
+#include "bvh/bvh_sort.h"
 
-#include "mesh.h"
-#include "object.h"
+#include "render/mesh.h"
+#include "render/object.h"
 
-#include "util_algorithm.h"
+#include "util/util_algorithm.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h
index dbdb51f1a5b..a874a118b99 100644
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -18,8 +18,8 @@
 #ifndef __BVH_SPLIT_H__
 #define __BVH_SPLIT_H__
 
-#include "bvh_build.h"
-#include "bvh_params.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_params.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp
index a876c670914..ef227d20ea9 100644
--- a/intern/cycles/bvh/bvh_unaligned.cpp
+++ b/intern/cycles/bvh/bvh_unaligned.cpp
@@ -15,17 +15,17 @@
  */
 
 
-#include "bvh_unaligned.h"
+#include "bvh/bvh_unaligned.h"
 
-#include "mesh.h"
-#include "object.h"
+#include "render/mesh.h"
+#include "render/object.h"
 
-#include "bvh_binning.h"
+#include "bvh/bvh_binning.h"
 #include "bvh_params.h"
 
-#include "util_boundbox.h"
-#include "util_debug.h"
-#include "util_transform.h"
+#include "util/util_boundbox.h"
+#include "util/util_debug.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h
index 4d0872f4a39..f41bae79e2b 100644
--- a/intern/cycles/bvh/bvh_unaligned.h
+++ b/intern/cycles/bvh/bvh_unaligned.h
@@ -17,7 +17,7 @@
 #ifndef __BVH_UNALIGNED_H__
 #define __BVH_UNALIGNED_H__
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 966ff5e52ba..6ef2aa1caad 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../kernel/osl
-	../util
-	../render
+	..
 	../../glew-mx
 )
 
@@ -33,6 +27,7 @@ set(SRC
 	device_cuda.cpp
 	device_multi.cpp
 	device_opencl.cpp
+	device_split_kernel.cpp
 	device_task.cpp
 )
 
@@ -56,6 +51,7 @@ set(SRC_HEADERS
 	device_memory.h
 	device_intern.h
 	device_network.h
+	device_split_kernel.h
 	device_task.h
 )
 
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 31c99f49d6d..968af447e29 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -17,18 +17,18 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "device.h"
-#include "device_intern.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_half.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_time.h"
-#include "util_types.h"
-#include "util_vector.h"
-#include "util_string.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_half.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_time.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,11 +48,11 @@ std::ostream& operator <<(std::ostream &os,
 	os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
 	/* TODO(sergey): Decode bitflag into list of names. */
 	os << "Nodes features: " << requested_features.nodes_features << std::endl;
-	os << "Use hair: "
+	os << "Use Hair: "
 	   << string_from_bool(requested_features.use_hair) << std::endl;
-	os << "Use object motion: "
+	os << "Use Object Motion: "
 	   << string_from_bool(requested_features.use_object_motion) << std::endl;
-	os << "Use camera motion: "
+	os << "Use Camera Motion: "
 	   << string_from_bool(requested_features.use_camera_motion) << std::endl;
 	os << "Use Baking: "
 	   << string_from_bool(requested_features.use_baking) << std::endl;
@@ -80,7 +80,7 @@ Device::~Device()
 
 void Device::pixels_alloc(device_memory& mem)
 {
-	mem_alloc(mem, MEM_READ_WRITE);
+	mem_alloc("pixels", mem, MEM_READ_WRITE);  /* "pixels" is the buffer name handed to mem_alloc() */
 }
 
 void Device::pixels_copy_from(device_memory& mem, int y, int w, int h)
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ccee25ae34e..ac06e561795 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -19,15 +19,15 @@
 
 #include <stdlib.h>
 
-#include "device_memory.h"
-#include "device_task.h"
+#include "device/device_memory.h"
+#include "device/device_task.h"
 
-#include "util_list.h"
-#include "util_stats.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_stats.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -121,6 +121,9 @@ public:
 	/* Use Transparent shadows */
 	bool use_transparent;
 
+	/* Use various shadow tricks, such as shadow catcher. */
+	bool use_shadow_tricks;
+
 	DeviceRequestedFeatures()
 	{
 		/* TODO(sergey): Find more meaningful defaults. */
@@ -137,6 +140,7 @@ public:
 		use_integrator_branched = false;
 		use_patch_evaluation = false;
 		use_transparent = false;
+		use_shadow_tricks = false;
 	}
 
 	bool modified(const DeviceRequestedFeatures& requested_features)
@@ -153,7 +157,8 @@ public:
 		         use_volume == requested_features.use_volume &&
 		         use_integrator_branched == requested_features.use_integrator_branched &&
 		         use_patch_evaluation == requested_features.use_patch_evaluation &&
-		         use_transparent == requested_features.use_transparent);
+		         use_transparent == requested_features.use_transparent &&
+		         use_shadow_tricks == requested_features.use_shadow_tricks);
 	}
 
 	/* Convert the requested features structure to a build options,
@@ -194,9 +199,12 @@ public:
 		if(!use_patch_evaluation) {
 			build_options += " -D__NO_PATCH_EVAL__";
 		}
-		if(!use_transparent) {
+		if(!use_transparent && !use_volume) {
 			build_options += " -D__NO_TRANSPARENT__";
 		}
+		if(!use_shadow_tricks) {
+			build_options += " -D__NO_SHADOW_TRICKS__";
+		}
 		return build_options;
 	}
 };
@@ -228,13 +236,21 @@ public:
 	DeviceInfo info;
 	virtual const string& error_message() { return error_msg; }
 	bool have_error() { return !error_message().empty(); }
+	virtual void set_error(const string& error)  /* record and report a device error */
+	{
+		if(!have_error()) {  /* only the first error message is stored */
+			error_msg = error;
+		}
+		fprintf(stderr, "%s\n", error.c_str());  /* every error is printed, stored or not */
+		fflush(stderr);
+	}
 	virtual bool show_samples() const { return false; }
 
 	/* statistics */
 	Stats &stats;
 
 	/* regular memory */
-	virtual void mem_alloc(device_memory& mem, MemoryType type) = 0;
+	virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0;
 	virtual void mem_copy_to(device_memory& mem) = 0;
 	virtual void mem_copy_from(device_memory& mem,
 		int y, int w, int h, int elem) = 0;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c8e001ec2fd..2761d9488ca 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -20,36 +20,124 @@
 /* So ImathMath is included before our kernel_cpu_compat. */
 #ifdef WITH_OSL
 /* So no context pollution happens from indirectly included windows.h */
-#  include "util_windows.h"
+#  include "util/util_windows.h"
 #  include <OSL/oslexec.h>
 #endif
 
-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"
 
-#include "kernel.h"
-#include "kernel_compat_cpu.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "kernel/kernel.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data.h"
+#include "kernel/kernel_globals.h"
 
-#include "osl_shader.h"
-#include "osl_globals.h"
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_opengl.h"
-#include "util_progress.h"
-#include "util_system.h"
-#include "util_thread.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_opengl.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
+class CPUDevice;
+
+class CPUSplitKernel : public DeviceSplitKernel {  /* split kernel driver for CPUDevice */
+	CPUDevice *device;  /* device passed to the constructor */
+public:
+	explicit CPUSplitKernel(CPUDevice *device);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+};
+
 class CPUDevice : public Device
 {
+	static unordered_map<string, void*> kernel_functions;
+
+	static void register_kernel_function(const char* name, void* func)  /* callback given to the kernel_cpu*_register_functions() calls */
+	{
+		kernel_functions[name] = func;  /* a later registration under the same name replaces the earlier one */
+	}
+
+	static const char* get_arch_name()  /* first kernel variant, AVX2 down to SSE2, that is compiled in and passes its system_cpu_support_*() check */
+	{
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+		if(system_cpu_support_avx2()) {
+			return "cpu_avx2";
+		}
+		else  /* chains into whichever branch the preprocessor keeps next */
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+		if(system_cpu_support_avx()) {
+			return "cpu_avx";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+		if(system_cpu_support_sse41()) {
+			return "cpu_sse41";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+		if(system_cpu_support_sse3()) {
+			return "cpu_sse3";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+		if(system_cpu_support_sse2()) {
+			return "cpu_sse2";
+		}
+		else
+#endif
+		{
+			return "cpu";  /* fallback when no optimized variant applies */
+		}
+	}
+
+	template<typename F>
+	static F get_kernel_function(string name)  /* looks up a registered kernel and casts it to function pointer type F */
+	{
+		name = string("kernel_") + get_arch_name() + "_" + name;  /* registry key: kernel_<arch>_<name> */
+
+		unordered_map<string, void*>::iterator it = kernel_functions.find(name);
+
+		if(it == kernel_functions.end()) {
+			assert(!"kernel function not found");
+			return NULL;  /* reached only when assertions are compiled out */
+		}
+
+		return (F)it->second;
+	}
+
+	friend class CPUSplitKernel;
+
 public:
 	TaskPool task_pool;
 	KernelGlobals kernel_globals;
@@ -57,10 +145,15 @@ public:
 #ifdef WITH_OSL
 	OSLGlobals osl_globals;
 #endif
+
+	bool use_split_kernel;
+
+	DeviceRequestedFeatures requested_features;
 	
 	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
 	: Device(info, stats, background)
 	{
+
 #ifdef WITH_OSL
 		kernel_globals.osl = &osl_globals;
 #endif
@@ -105,6 +198,28 @@ public:
 		{
 			VLOG(1) << "Will be using regular kernels.";
 		}
+
+		use_split_kernel = DebugFlags().cpu.split_kernel;
+		if(use_split_kernel) {
+			VLOG(1) << "Will be using split kernel.";
+		}
+
+		kernel_cpu_register_functions(register_kernel_function);
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+		kernel_cpu_sse2_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+		kernel_cpu_sse3_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+		kernel_cpu_sse41_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+		kernel_cpu_avx_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+		kernel_cpu_avx2_register_functions(register_kernel_function);
+#endif
 	}
 
 	~CPUDevice()
@@ -117,9 +232,20 @@ public:
 		return (TaskScheduler::num_threads() == 1);
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
+		if(name) {  /* a NULL name suppresses the log line */
+			VLOG(1) << "Buffer allocate: " << name << ", "
+			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			        << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		mem.device_pointer = mem.data_pointer;
+		/* No host buffer: fall back to a malloc'ed block, which mem_free() releases. */
+		if(!mem.device_pointer) {
+			mem.device_pointer = (device_ptr)malloc(mem.memory_size());  /* NOTE(review): result is not checked for NULL */
+		}
+
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
 	}
@@ -144,6 +270,10 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
+			if(!mem.data_pointer) {
+				free((void*)mem.device_pointer);
+			}
+
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
 			mem.device_size = 0;
@@ -196,8 +326,14 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE)
-			thread_path_trace(*task);
+		if(task->type == DeviceTask::PATH_TRACE) {
+			if(!use_split_kernel) {
+				thread_path_trace(*task);
+			}
+			else {
+				thread_path_trace_split(*task);
+			}
+		}
 		else if(task->type == DeviceTask::FILM_CONVERT)
 			thread_film_convert(*task);
 		else if(task->type == DeviceTask::SHADER)
@@ -258,7 +394,7 @@ public:
 		{
 			path_trace_kernel = kernel_cpu_path_trace;
 		}
-		
+
 		while(task.acquire_tile(this, tile)) {
 			float *render_buffer = (float*)tile.buffer;
 			uint *rng_state = (uint*)tile.rng_state;
@@ -294,6 +430,49 @@ public:
 		thread_kernel_globals_free(&kg);
 	}
 
+	void thread_path_trace_split(DeviceTask& task)  /* PATH_TRACE task body used by thread_run() when use_split_kernel is set */
+	{
+		if(task_pool.canceled()) {
+			if(task.need_finish_queue == false)
+				return;
+		}
+
+		RenderTile tile;
+
+		CPUSplitKernel split_kernel(this);
+
+		/* allocate buffer for kernel globals */
+		device_memory kgbuffer;
+		kgbuffer.resize(sizeof(KernelGlobals));
+		mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
+
+		KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;  /* NOTE(review): not checked for NULL before the write below */
+		*kg = thread_kernel_globals_init();
+
+		requested_features.max_closure = MAX_CLOSURE;
+		if(!split_kernel.load_kernels(requested_features)) {  /* release the globals before giving up */
+			thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+			mem_free(kgbuffer);
+
+			return;
+		}
+
+		while(task.acquire_tile(this, tile)) {
+			device_memory data;  /* left empty: never resized or allocated in this function */
+			split_kernel.path_trace(&task, tile, kgbuffer, data);
+
+			task.release_tile(tile);
+
+			if(task_pool.canceled()) {  /* checked once per finished tile */
+				if(task.need_finish_queue == false)
+					break;
+			}
+		}
+
+		thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+		mem_free(kgbuffer);
+	}
+
 	void thread_film_convert(DeviceTask& task)
 	{
 		float sample_scale = 1.0f/(task.sample + 1);
@@ -501,6 +680,10 @@ protected:
 
 	inline void thread_kernel_globals_free(KernelGlobals *kg)
 	{
+		if(kg == NULL) {
+			return;
+		}
+
 		if(kg->transparent_shadow_intersections != NULL) {
 			free(kg->transparent_shadow_intersections);
 		}
@@ -515,8 +698,175 @@ protected:
 		OSLShader::thread_free(kg);
 #endif
 	}
+
+	virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {  /* NOTE(review): non-const reference; confirm this matches, and so overrides, the base class signature */
+		requested_features = requested_features_;  /* copy read later by thread_path_trace_split() */
+
+		return true;
+	}
+};
+
+/* split kernel */
+
+class CPUSplitKernelFunction : public SplitKernelFunction {  /* one split kernel entry point, called on the host */
+public:
+	CPUDevice* device;
+	void (*func)(KernelGlobals *kg, KernelData *data);  /* NULL until assigned by the creator */
+
+	CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
+	~CPUSplitKernelFunction() {}
+
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
+	{
+		if(!func) {  /* nothing to run */
+			return false;
+		}
+
+		KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+		kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+		for(int y = 0; y < dim.global_size[1]; y++) {  /* work items run one after another, row by row */
+			for(int x = 0; x < dim.global_size[0]; x++) {
+				kg->global_id = make_int2(x, y);  /* current work item id, stored before each call */
+
+				func(kg, (KernelData*)data.device_pointer);
+			}
+		}
+
+		return true;
+	}
 };
 
+CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)  /* forwards the device to the base class and keeps a CPUDevice-typed pointer */
+{
+}
+
+bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                                    RenderTile& rtile,
+                                                    int num_global_elements,
+                                                    device_memory& kernel_globals,
+                                                    device_memory& data,
+                                                    device_memory& split_data,
+                                                    device_memory& ray_state,
+                                                    device_memory& queue_index,
+                                                    device_memory& use_queues_flags,
+                                                    device_memory& work_pool_wgs)
+{
+	typedef void(*data_init_t)(KernelGlobals *kg,
+	                           ccl_constant KernelData *data,
+	                           ccl_global void *split_data_buffer,
+	                           int num_elements,
+	                           ccl_global char *ray_state,
+	                           ccl_global uint *rng_state,
+	                           int start_sample,
+	                           int end_sample,
+	                           int sx, int sy, int sw, int sh, int offset, int stride,
+	                           ccl_global int *Queue_index,
+	                           int queuesize,
+	                           ccl_global char *use_queues_flag,
+	                           ccl_global unsigned int *work_pool_wgs,
+	                           unsigned int num_samples,
+	                           ccl_global float *buffer);
+
+	data_init_t data_init;  /* chosen below with the same AVX2 > AVX > SSE4.1 > SSE3 > SSE2 > generic priority as get_arch_name() */
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+	if(system_cpu_support_avx2()) {
+		data_init = kernel_cpu_avx2_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+	if(system_cpu_support_avx()) {
+		data_init = kernel_cpu_avx_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+	if(system_cpu_support_sse41()) {
+		data_init = kernel_cpu_sse41_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+	if(system_cpu_support_sse3()) {
+		data_init = kernel_cpu_sse3_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+	if(system_cpu_support_sse2()) {
+		data_init = kernel_cpu_sse2_data_init;
+	}
+	else
+#endif
+	{
+		data_init = kernel_cpu_data_init;
+	}
+
+	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+	kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+	for(int y = 0; y < dim.global_size[1]; y++) {  /* data_init is called once per work item, one after another */
+		for(int x = 0; x < dim.global_size[0]; x++) {
+			kg->global_id = make_int2(x, y);
+
+			data_init((KernelGlobals*)kernel_globals.device_pointer,
+			          (KernelData*)data.device_pointer,
+			          (void*)split_data.device_pointer,
+			          num_global_elements,
+			          (char*)ray_state.device_pointer,
+			          (uint*)rtile.rng_state,
+			          rtile.start_sample,
+			          rtile.start_sample + rtile.num_samples,  /* end_sample */
+			          rtile.x,
+			          rtile.y,
+			          rtile.w,
+			          rtile.h,
+			          rtile.offset,
+			          rtile.stride,
+			          (int*)queue_index.device_pointer,
+			          dim.global_size[0] * dim.global_size[1],  /* queuesize */
+			          (char*)use_queues_flags.device_pointer,
+			          (uint*)work_pool_wgs.device_pointer,
+			          rtile.num_samples,
+			          (float*)rtile.buffer);
+		}
+	}
+
+	return true;  /* this implementation has no failure path */
+}
+
+SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+	CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);  /* returned to the caller on success */
+
+	kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
+	if(!kernel->func) {  /* name not registered: discard the wrapper and report failure */
+		delete kernel;
+		return NULL;
+	}
+
+	return kernel;
+}
+
+int2 CPUSplitKernel::split_kernel_local_size()
+{
+	return make_int2(1, 1);  /* fixed 1x1 local size on the CPU */
+}
+
+int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
+	return make_int2(64, 1);  /* fixed 64x1 global size; none of the arguments are consulted */
+}
+
+uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
+	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+	/* The size is computed on the host by split_data_buffer_size(). */
+	return split_data_buffer_size(kg, num_threads);
+}
+
+unordered_map<string, void*> CPUDevice::kernel_functions;
+
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
 {
 	return new CPUDevice(info, stats, background);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index dafac6dfcb3..606494f08ed 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -15,32 +15,36 @@
  */
 
 #include <climits>
+#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
 #ifdef WITH_CUDA_DYNLOAD
 #  include "cuew.h"
 #else
-#  include "util_opengl.h"
+#  include "util/util_opengl.h"
 #  include <cuda.h>
 #  include <cudaGL.h>
 #endif
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_time.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_time.h"
+
+#include "kernel/split/kernel_split_data_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -78,6 +82,31 @@ int cuewCompilerVersion(void)
 }  /* namespace */
 #endif  /* WITH_CUDA_DYNLOAD */
 
+class CUDADevice;
+
+class CUDASplitKernel : public DeviceSplitKernel {  /* split kernel driver for CUDADevice */
+	CUDADevice *device;  /* device passed to the constructor */
+public:
+	explicit CUDASplitKernel(CUDADevice *device);
+
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+};
+
 class CUDADevice : public Device
 {
 public:
@@ -258,16 +287,21 @@ public:
 		return DebugFlags().cuda.adaptive_compile;
 	}
 
+	bool use_split_kernel()  /* reads the CUDA split_kernel debug flag on every call */
+	{
+		return DebugFlags().cuda.split_kernel;
+	}
+
 	/* Common NVCC flags which stays the same regardless of shading model,
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features)
+	        const DeviceRequestedFeatures& requested_features, bool split=false)
 	{
 		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
-		const string kernel_path = path_get("kernel");
-		const string include = kernel_path;
+		const string source_path = path_get("source");
+		const string include_path = source_path;
 		string cflags = string_printf("-m%d "
 		                              "--ptxas-options=\"-v\" "
 		                              "--use_fast_math "
@@ -276,7 +310,7 @@ public:
 		                               "-I\"%s\"",
 		                              machine,
 		                              cuda_version,
-		                              include.c_str());
+		                              include_path.c_str());
 		if(use_adaptive_compilation()) {
 			cflags += " " + requested_features.get_build_options();
 		}
@@ -287,6 +321,11 @@ public:
 #ifdef WITH_CYCLES_DEBUG
 		cflags += " -D__KERNEL_DEBUG__";
 #endif
+
+		if(split) {
+			cflags += " -D__SPLIT__";
+		}
+
 		return cflags;
 	}
 
@@ -306,21 +345,21 @@ public:
 			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
 			return false;
 		}
-		if(cuda_version < 75) {
+		if(cuda_version < 80) {
 			printf("Unsupported CUDA version %d.%d detected, "
-			       "you need CUDA 7.5 or newer.\n",
+			       "you need CUDA 8.0 or newer.\n",
 			       major, minor);
 			return false;
 		}
-		else if(cuda_version != 75 && cuda_version != 80) {
+		else if(cuda_version != 80) {
 			printf("CUDA version %d.%d detected, build may succeed but only "
-			       "CUDA 7.5 and 8.0 are officially supported.\n",
+			       "CUDA 8.0 is officially supported.\n",
 			       major, minor);
 		}
 		return true;
 	}
 
-	string compile_kernel(const DeviceRequestedFeatures& requested_features)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
 	{
 		/* Compute cubin name. */
 		int major, minor;
@@ -329,7 +368,8 @@ public:
 
 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
-			const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
+			const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
+			                                                  : "lib/kernel_sm_%d%d.cubin",
 			                                            major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
@@ -339,18 +379,19 @@ public:
 		}
 
 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features);
+		        compile_kernel_get_common_cflags(requested_features, split);
 
 		/* Try to use locally compiled kernel. */
-		const string kernel_path = path_get("kernel");
-		const string kernel_md5 = path_files_md5_hash(kernel_path);
+		const string source_path = path_get("source");
+		const string kernel_md5 = path_files_md5_hash(source_path);
 
 		/* We include cflags into md5 so changing cuda toolkit or changing other
 		 * compiler command line arguments makes sure cubin gets re-built.
 		 */
 		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
 
-		const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
+		const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
+		                                              : "cycles_kernel_sm%d%d_%s.cubin",
 		                                        major, minor,
 		                                        cubin_md5.c_str());
 		const string cubin = path_cache_get(path_join("kernels", cubin_file));
@@ -383,9 +424,10 @@ public:
 			return "";
 		}
 		const char *nvcc = cuewCompilerPath();
-		const string kernel = path_join(kernel_path,
-		                          path_join("kernels",
-		                                    path_join("cuda", "kernel.cu")));
+		const string kernel = path_join(
+		        path_join(source_path, "kernel"),
+		        path_join("kernels",
+		                  path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
 
@@ -433,7 +475,7 @@ public:
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel(requested_features);
+		string cubin = compile_kernel(requested_features, use_split_kernel());
 
 		if(cubin == "")
 			return false;
@@ -466,8 +508,14 @@ public:
 		}
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
+		if(name) {
+			VLOG(1) << "Buffer allocate: " << name << ", "
+			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			        << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		cuda_push_context();
 		CUdeviceptr device_pointer;
 		size_t size = mem.memory_size();
@@ -504,7 +552,9 @@ public:
 
 	void mem_zero(device_memory& mem)
 	{
-		memset((void*)mem.data_pointer, 0, mem.memory_size());
+		if(mem.data_pointer) {
+			memset((void*)mem.data_pointer, 0, mem.memory_size());
+		}
 
 		cuda_push_context();
 		if(mem.device_pointer)
@@ -617,7 +667,7 @@ public:
 		/* Data Storage */
 		if(interpolation == INTERPOLATION_NONE) {
 			if(has_bindless_textures) {
-				mem_alloc(mem, MEM_READ_ONLY);
+				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);
 
 				cuda_push_context();
@@ -641,7 +691,7 @@ public:
 				cuda_pop_context();
 			}
 			else {
-				mem_alloc(mem, MEM_READ_ONLY);
+				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);
 
 				cuda_push_context();
@@ -1258,25 +1308,48 @@ public:
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();
 
-			/* keep rendering tiles until done */
-			while(task->acquire_tile(this, tile)) {
-				int start_sample = tile.start_sample;
-				int end_sample = tile.start_sample + tile.num_samples;
+			if(!use_split_kernel()) {
+				/* keep rendering tiles until done */
+				while(task->acquire_tile(this, tile)) {
+					int start_sample = tile.start_sample;
+					int end_sample = tile.start_sample + tile.num_samples;
 
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if(task->get_cancel()) {
-						if(task->need_finish_queue == false)
-							break;
-					}
+					for(int sample = start_sample; sample < end_sample; sample++) {
+						if(task->get_cancel()) {
+							if(task->need_finish_queue == false)
+								break;
+						}
 
-					path_trace(tile, sample, branched);
+						path_trace(tile, sample, branched);
 
-					tile.sample = sample + 1;
+						tile.sample = sample + 1;
 
-					task->update_progress(&tile, tile.w*tile.h);
+						task->update_progress(&tile, tile.w*tile.h);
+					}
+
+					task->release_tile(tile);
+				}
+			}
+			else {
+				DeviceRequestedFeatures requested_features;
+				if(!use_adaptive_compilation()) {
+					requested_features.max_closure = 64;
 				}
 
-				task->release_tile(tile);
+				CUDASplitKernel split_kernel(this);
+				const bool kernels_loaded = split_kernel.load_kernels(requested_features);
+				/* Trace no tiles when the split kernels failed to load, as the CPU device does. */
+				while(kernels_loaded && task->acquire_tile(this, tile)) {
+					device_memory void_buffer;  /* empty placeholder passed for both kernel globals and data */
+					split_kernel.path_trace(task, tile, void_buffer, void_buffer);
+
+					task->release_tile(tile);
+
+					if(task->get_cancel()) {
+						if(task->need_finish_queue == false)
+							break;
+					}
+				}
 			}
 		}
 		else if(task->type == DeviceTask::SHADER) {
@@ -1329,8 +1402,223 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+	friend class CUDASplitKernelFunction;
+	friend class CUDASplitKernel;
+};
+
+/* Redefine cuda_assert for use outside of the CUDADevice class, now that the class
+ * definition is complete. It needs a `device` pointer in scope at the call site, records
+ * only the first error message, and does not abort: execution continues after a failure. */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+	{ \
+		CUresult result = stmt; \
+		\
+		if(result != CUDA_SUCCESS) { \
+			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			if(device->error_msg == "") \
+				device->error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+			/*cuda_abort();*/ \
+			device->cuda_error_documentation(); \
+		} \
+	} (void)0
+
+/* split kernel */
+
+class CUDASplitKernelFunction : public SplitKernelFunction{
+	CUDADevice* device;
+	CUfunction func;
+public:
+	CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
+
+	/* enqueue the kernel without arguments, returns false if there is an error */
+	bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
+	{
+		return enqueue(dim, NULL);
+	}
+
+	/* enqueue the kernel, returns false on error; the pushed context is popped on every return path */
+	bool enqueue(const KernelDimensions &dim, void *args[])
+	{
+		device->cuda_push_context();
+
+		/* we ignore dim.local_size for now, as this is faster */
+		int threads_per_block = 0;
+		if(!device->have_error()) {
+			cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+		}
+		if(device->have_error() || threads_per_block <= 0) {
+			device->cuda_pop_context();  /* balance the push; a zero block size would also divide by zero below */
+			return false;
+		}
+
+		int xthreads = (int)sqrt(threads_per_block);
+		int ythreads = (int)sqrt(threads_per_block);
+		int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
+		int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+
+		cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+		cuda_assert(cuLaunchKernel(func,
+		                           xblocks , yblocks, 1, /* blocks */
+		                           xthreads, ythreads, 1, /* threads */
+		                           0, 0, args, 0));
+
+		device->cuda_pop_context();
+		return !device->have_error();
+	}
 };
 
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)  /* forwards the device to the base class and keeps a CUDADevice-typed pointer */
+{
+}
+
+uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+	device_vector<uint64_t> size_buffer;  /* one-element buffer the kernel writes the size into */
+	size_buffer.resize(1);
+	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+	device->cuda_push_context();
+
+	uint threads = num_threads;  /* narrowed from size_t to uint for the kernel argument */
+	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+	struct args_t {
+		uint* num_threads;
+		CUdeviceptr* size;
+	};
+
+	args_t args = {
+		&threads,
+		&d_size
+	};
+
+	CUfunction state_buffer_size;
+	cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+	cuda_assert(cuLaunchKernel(state_buffer_size,
+	                           1, 1, 1,  /* blocks */
+	                           1, 1, 1,  /* threads */
+	                           0, 0, (void**)&args, 0));  /* NOTE(review): relies on args_t being laid out like an array of pointers */
+
+	device->cuda_pop_context();
+
+	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
+	device->mem_free(size_buffer);
+
+	return *size_buffer.get_data();  /* NOTE(review): read after mem_free(); presumably the host-side copy is still valid - confirm */
+}
+
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                    RenderTile& rtile,
+                                    int num_global_elements,
+                                    device_memory& /*kernel_globals*/,
+                                    device_memory& /*kernel_data*/,
+                                    device_memory& split_data,
+                                    device_memory& ray_state,
+                                    device_memory& queue_index,
+                                    device_memory& use_queues_flag,
+                                    device_memory& work_pool_wgs)
+{
+	device->cuda_push_context();
+	/* Device-side addresses of every buffer handed to the kernel. */
+	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+	CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+	CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+	CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+	CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
+	CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+	int end_sample = rtile.start_sample + rtile.num_samples;
+	int queue_size = dim.global_size[0] * dim.global_size[1];
+	/* One pointer per kernel parameter; presumably in the kernel's parameter order - confirm. */
+	struct args_t {
+		CUdeviceptr* split_data_buffer;
+		int* num_elements;
+		CUdeviceptr* ray_state;
+		CUdeviceptr* rng_state;
+		int* start_sample;
+		int* end_sample;
+		int* sx;
+		int* sy;
+		int* sw;
+		int* sh;
+		int* offset;
+		int* stride;
+		CUdeviceptr* queue_index;
+		int* queuesize;
+		CUdeviceptr* use_queues_flag;
+		CUdeviceptr* work_pool_wgs;
+		int* num_samples;
+		CUdeviceptr* buffer;
+	};
+
+	args_t args = {
+		&d_split_data,
+		&num_global_elements,
+		&d_ray_state,
+		&d_rng_state,
+		&rtile.start_sample,
+		&end_sample,
+		&rtile.x,
+		&rtile.y,
+		&rtile.w,
+		&rtile.h,
+		&rtile.offset,
+		&rtile.stride,
+		&d_queue_index,
+		&queue_size,
+		&d_use_queues_flag,
+		&d_work_pool_wgs,
+		&rtile.num_samples,
+		&d_buffer
+	};
+
+	CUfunction data_init;
+	cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+	if(device->have_error()) {
+		device->cuda_pop_context();  /* balance the push above on the failure path too */
+		return false;
+	}
+
+	CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
+	device->cuda_pop_context();
+
+	return !device->have_error();
+}
+
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+	CUfunction func;
+
+	device->cuda_push_context();
+
+	cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+	if(device->have_error()) {
+		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+		device->cuda_pop_context();  /* balance the push above on the failure path too */
+		return NULL;
+	}
+	device->cuda_pop_context();
+
+	return new CUDASplitKernelFunction(device, func);  /* returned to the caller, wrapping kernel_cuda_<kernel_name> */
+}
+
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+	return make_int2(32, 1);  /* fixed 32x1 local size; NOTE(review): CUDASplitKernelFunction::enqueue() says it ignores dim.local_size */
+}
+
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
+{
+	/* TODO(mai): implement something here to detect ideal work size */
+	return make_int2(256, 256);  /* fixed size for now; none of the arguments are consulted */
+}
+
 bool device_cuda_init(void)
 {
 #ifdef WITH_CUDA_DYNLOAD
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 5b5b4dc6802..4b10514a9d2 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -28,10 +28,10 @@
  * other devices this is a pointer to device memory, where we will copy memory
  * to and from. */
 
-#include "util_debug.h"
-#include "util_half.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_debug.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,7 +48,8 @@ enum DataType {
 	TYPE_UINT,
 	TYPE_INT,
 	TYPE_FLOAT,
-	TYPE_HALF
+	TYPE_HALF,
+	TYPE_UINT64,
 };
 
 static inline size_t datatype_size(DataType datatype) 
@@ -59,6 +60,7 @@ static inline size_t datatype_size(DataType datatype)
 		case TYPE_UINT: return sizeof(uint);
 		case TYPE_INT: return sizeof(int);
 		case TYPE_HALF: return sizeof(half);
+		case TYPE_UINT64: return sizeof(uint64_t);
 		default: return 0;
 	}
 }
@@ -160,6 +162,11 @@ template<> struct device_type_traits<half4> {
 	static const int num_elements = 4;
 };
 
+template<> struct device_type_traits<uint64_t> {
+	static const DataType data_type = TYPE_UINT64;
+	static const int num_elements = 1;
+};
+
 /* Device Memory */
 
 class device_memory
@@ -180,10 +187,27 @@ public:
 	/* device pointer */
 	device_ptr device_pointer;
 
-protected:
-	device_memory() {}
+	device_memory()
+	{
+		data_type = device_type_traits<uchar>::data_type;
+		data_elements = device_type_traits<uchar>::num_elements;
+		data_pointer = 0;
+		data_size = 0;
+		device_size = 0;
+		data_width = 0;
+		data_height = 0;
+		data_depth = 0;
+		device_pointer = 0;
+	}
 	virtual ~device_memory() { assert(!device_pointer); }
 
+	void resize(size_t size)
+	{
+		data_size = size;
+		data_width = size;
+	}
+
+protected:
 	/* no copying */
 	device_memory(const device_memory&);
 	device_memory& operator = (const device_memory&);
@@ -198,16 +222,8 @@ public:
 	{
 		data_type = device_type_traits<T>::data_type;
 		data_elements = device_type_traits<T>::num_elements;
-		data_pointer = 0;
-		data_size = 0;
-		device_size = 0;
-		data_width = 0;
-		data_height = 0;
-		data_depth = 0;
 
 		assert(data_elements > 0);
-
-		device_pointer = 0;
 	}
 
 	virtual ~device_vector() {}
@@ -266,6 +282,7 @@ public:
 		data_height = 0;
 		data_depth = 0;
 		data_size = 0;
+		device_pointer = 0;
 	}
 
 	size_t size()
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 31b800640d3..624260a81c8 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -17,17 +17,17 @@
 #include <stdlib.h>
 #include <sstream>
 
-#include "device.h"
-#include "device_intern.h"
-#include "device_network.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_network.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_foreach.h"
-#include "util_list.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_time.h"
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -106,11 +106,11 @@ public:
 		return true;
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
 	{
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
-			sub.device->mem_alloc(mem, type);
+			sub.device->mem_alloc(name, mem, type);
 			sub.ptr_map[unique_ptr] = mem.device_pointer;
 		}
 
@@ -162,6 +162,7 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
+		stats.mem_free(mem.device_size);
 
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
@@ -170,7 +171,6 @@ public:
 		}
 
 		mem.device_pointer = 0;
-		stats.mem_free(mem.device_size);
 	}
 
 	void const_copy_to(const char *name, void *host, size_t size)
@@ -202,6 +202,7 @@ public:
 	void tex_free(device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
+		stats.mem_free(mem.device_size);
 
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
@@ -210,7 +211,6 @@ public:
 		}
 
 		mem.device_pointer = 0;
-		stats.mem_free(mem.device_size);
 	}
 
 	void pixels_alloc(device_memory& mem)
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 53eef6cf199..66758954f44 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "device_intern.h"
-#include "device_network.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_network.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 #if defined(WITH_NETWORK)
 
@@ -87,8 +87,14 @@ public:
 		snd.write();
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
 	{
+		if(name) {
+			VLOG(1) << "Buffer allocate: " << name << ", "
+				    << string_human_readable_number(mem.memory_size()) << " bytes. ("
+				    << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		thread_scoped_lock lock(rpc_lock);
 
 		mem.device_pointer = ++mem_counter;
@@ -481,7 +487,7 @@ protected:
 				mem.data_pointer = 0;
 
 			/* perform the allocation on the actual device */
-			device->mem_alloc(mem, type);
+			device->mem_alloc(NULL, mem, type);
 
 			/* store a mapping to/from client_pointer and real device pointer */
 			pointer_mapping_insert(client_pointer, mem.device_pointer);
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index d28cfe3121f..a5d24c66018 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -33,12 +33,12 @@
 #include <sstream>
 #include <deque>
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_foreach.h"
-#include "util_list.h"
-#include "util_map.h"
-#include "util_string.h"
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index ba94c592a5f..edd2047debc 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -16,12 +16,12 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl/opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "device_intern.h"
+#include "device/device_intern.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
new file mode 100644
index 00000000000..ae462a560b7
--- /dev/null
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_split_kernel.h"
+
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
+
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+static const double alpha = 0.1; /* alpha for rolling average */
+
+DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
+{
+	current_max_closure = -1;
+	first_tile = true;
+
+	avg_time_per_sample = 0.0;
+
+	kernel_path_init = NULL;
+	kernel_scene_intersect = NULL;
+	kernel_lamp_emission = NULL;
+	kernel_do_volume = NULL;
+	kernel_queue_enqueue = NULL;
+	kernel_indirect_background = NULL;
+	kernel_shader_eval = NULL;
+	kernel_holdout_emission_blurring_pathtermination_ao = NULL;
+	kernel_subsurface_scatter = NULL;
+	kernel_direct_lighting = NULL;
+	kernel_shadow_blocked_ao = NULL;
+	kernel_shadow_blocked_dl = NULL;
+	kernel_next_iteration_setup = NULL;
+	kernel_indirect_subsurface = NULL;
+	kernel_buffer_update = NULL;
+}
+
+DeviceSplitKernel::~DeviceSplitKernel()
+{
+	device->mem_free(split_data);
+	device->mem_free(ray_state);
+	device->mem_free(use_queues_flag);
+	device->mem_free(queue_index);
+	device->mem_free(work_pool_wgs);
+
+	delete kernel_path_init;
+	delete kernel_scene_intersect;
+	delete kernel_lamp_emission;
+	delete kernel_do_volume;
+	delete kernel_queue_enqueue;
+	delete kernel_indirect_background;
+	delete kernel_shader_eval;
+	delete kernel_holdout_emission_blurring_pathtermination_ao;
+	delete kernel_subsurface_scatter;
+	delete kernel_direct_lighting;
+	delete kernel_shadow_blocked_ao;
+	delete kernel_shadow_blocked_dl;
+	delete kernel_next_iteration_setup;
+	delete kernel_indirect_subsurface;
+	delete kernel_buffer_update;
+}
+
+bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
+{
+#define LOAD_KERNEL(name) \
+		kernel_##name = get_split_kernel_function(#name, requested_features); \
+		if(!kernel_##name) { \
+			return false; \
+		}
+
+	LOAD_KERNEL(path_init);
+	LOAD_KERNEL(scene_intersect);
+	LOAD_KERNEL(lamp_emission);
+	LOAD_KERNEL(do_volume);
+	LOAD_KERNEL(queue_enqueue);
+	LOAD_KERNEL(indirect_background);
+	LOAD_KERNEL(shader_eval);
+	LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+	LOAD_KERNEL(subsurface_scatter);
+	LOAD_KERNEL(direct_lighting);
+	LOAD_KERNEL(shadow_blocked_ao);
+	LOAD_KERNEL(shadow_blocked_dl);
+	LOAD_KERNEL(next_iteration_setup);
+	LOAD_KERNEL(indirect_subsurface);
+	LOAD_KERNEL(buffer_update);
+
+#undef LOAD_KERNEL
+
+	current_max_closure = requested_features.max_closure;
+
+	return true;
+}
+
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
+{
+	uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
+	return max_buffer_size / size_per_element;
+}
+
+bool DeviceSplitKernel::path_trace(DeviceTask *task,
+                                   RenderTile& tile,
+                                   device_memory& kgbuffer,
+                                   device_memory& kernel_data)
+{
+	if(device->have_error()) {
+		return false;
+	}
+
+	/* Get local size */
+	size_t local_size[2];
+	{
+		int2 lsize = split_kernel_local_size();
+		local_size[0] = lsize[0];
+		local_size[1] = lsize[1];
+	}
+
+	/* Set global size */
+	size_t global_size[2];
+	{
+		int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
+
+		/* Make sure that set work size is a multiple of local
+		 * work size dimensions.
+		 */
+		global_size[0] = round_up(gsize[0], local_size[0]);
+		global_size[1] = round_up(gsize[1], local_size[1]);
+	}
+
+	/* Number of elements in the global state buffer */
+	int num_global_elements = global_size[0] * global_size[1];
+	assert(num_global_elements % WORK_POOL_SIZE == 0);
+
+	/* Allocate all required global memory once. */
+	if(first_tile) {
+		first_tile = false;
+
+		/* Calculate max groups */
+
+		/* Denotes the maximum work groups possible w.r.t. current requested tile size. */
+		unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1;
+
+		/* Allocate work_pool_wgs memory. */
+		work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
+		device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
+
+		queue_index.resize(NUM_QUEUES * sizeof(int));
+		device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
+
+		use_queues_flag.resize(sizeof(char));
+		device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
+
+		ray_state.resize(num_global_elements);
+		device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
+
+		split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
+		device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
+	}
+
+#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
+		if(device->have_error()) { \
+			return false; \
+		} \
+		if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
+			return false; \
+		}
+
+	tile.sample = tile.start_sample;
+
+	/* for exponential increase between tile updates */
+	int time_multiplier = 1;
+
+	while(tile.sample < tile.start_sample + tile.num_samples) {
+		/* to keep track of how long it takes to run a number of samples */
+		double start_time = time_dt();
+
+		/* initial guess to start rolling average */
+		const int initial_num_samples = 1;
+		/* approx number of samples per second */
+		int samples_per_second = (avg_time_per_sample > 0.0) ?
+		                         int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;
+
+		RenderTile subtile = tile;
+		subtile.start_sample = tile.sample;
+		subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);
+
+		if(device->have_error()) {
+			return false;
+		}
+
+		/* reset state memory here as global size for data_init
+		 * kernel might not be large enough to do in kernel
+		 */
+		device->mem_zero(work_pool_wgs);
+		device->mem_zero(split_data);
+		device->mem_zero(ray_state);
+
+		if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+		                                   subtile,
+		                                   num_global_elements,
+		                                   kgbuffer,
+		                                   kernel_data,
+		                                   split_data,
+		                                   ray_state,
+		                                   queue_index,
+		                                   use_queues_flag,
+		                                   work_pool_wgs))
+		{
+			return false;
+		}
+
+		ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
+
+		bool activeRaysAvailable = true;
+
+		while(activeRaysAvailable) {
+			/* Do path-iteration in host [Enqueue Path-iteration kernels]. */
+			for(int PathIter = 0; PathIter < 16; PathIter++) {
+				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
+
+				if(task->get_cancel()) {
+					return true;
+				}
+			}
+
+			/* Decide if we should exit path-iteration in host. */
+			device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1);
+
+			activeRaysAvailable = false;
+
+			for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
+				int8_t state = ray_state.get_data()[rayStateIter];
+
+				if(state != RAY_INACTIVE) {
+					if(state == RAY_INVALID) {
+						/* Something went wrong, abort to avoid looping endlessly. */
+						device->set_error("Split kernel error: invalid ray state");
+						return false;
+					}
+
+					/* Not all rays are RAY_INACTIVE. */
+					activeRaysAvailable = true;
+					break;
+				}
+			}
+
+			if(task->get_cancel()) {
+				return true;
+			}
+		}
+
+		double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
+
+		if(avg_time_per_sample == 0.0) {
+			/* start rolling average */
+			avg_time_per_sample = time_per_sample;
+		}
+		else {
+			avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
+		}
+
+#undef ENQUEUE_SPLIT_KERNEL
+
+		tile.sample += subtile.num_samples;
+		task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
+
+		time_multiplier = min(time_multiplier << 1, 10);
+
+		if(task->get_cancel()) {
+			return true;
+		}
+	}
+
+	return true;
+}
+
+CCL_NAMESPACE_END
+
+
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
new file mode 100644
index 00000000000..15a94953a11
--- /dev/null
+++ b/intern/cycles/device/device_split_kernel.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_SPLIT_KERNEL_H__
+#define __DEVICE_SPLIT_KERNEL_H__
+
+#include "device/device.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* When allocating global memory in chunks, we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks,
+ * since some bytes may be needed for aligning chunks of memory.
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+/* Types used for split kernel */
+
+class KernelDimensions {
+public:
+	size_t global_size[2];
+	size_t local_size[2];
+
+	KernelDimensions(size_t global_size_[2], size_t local_size_[2])
+	{
+		memcpy(global_size, global_size_, sizeof(global_size));
+		memcpy(local_size, local_size_, sizeof(local_size));
+	}
+};
+
+class SplitKernelFunction {
+public:
+	virtual ~SplitKernelFunction() {}
+
+	/* enqueue the kernel, returns false if there is an error */
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0;
+};
+
+class DeviceSplitKernel {
+private:
+	Device *device;
+
+	SplitKernelFunction *kernel_path_init;
+	SplitKernelFunction *kernel_scene_intersect;
+	SplitKernelFunction *kernel_lamp_emission;
+	SplitKernelFunction *kernel_do_volume;
+	SplitKernelFunction *kernel_queue_enqueue;
+	SplitKernelFunction *kernel_indirect_background;
+	SplitKernelFunction *kernel_shader_eval;
+	SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
+	SplitKernelFunction *kernel_subsurface_scatter;
+	SplitKernelFunction *kernel_direct_lighting;
+	SplitKernelFunction *kernel_shadow_blocked_ao;
+	SplitKernelFunction *kernel_shadow_blocked_dl;
+	SplitKernelFunction *kernel_next_iteration_setup;
+	SplitKernelFunction *kernel_indirect_subsurface;
+	SplitKernelFunction *kernel_buffer_update;
+
+	/* Global memory variables [porting]. This memory is used for
+	 * co-operation between different kernels: data written by one
+	 * kernel will be available to another kernel via this global
+	 * memory.
+	 */
+	device_memory split_data;
+	device_vector<uchar> ray_state;
+	device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
+
+	/* Flag to make the scene_intersect and lamp_emission kernels use queues. */
+	device_memory use_queues_flag;
+
+	/* Approximate time it takes to complete one sample */
+	double avg_time_per_sample;
+
+	/* Work pool with respect to each work group. */
+	device_memory work_pool_wgs;
+
+	/* clos_max value for which the kernels have been loaded currently. */
+	int current_max_closure;
+
+	/* Set to true in the constructor; cleared by path_trace() when it allocates the global memory for the first tile. */
+	bool first_tile;
+
+public:
+	explicit DeviceSplitKernel(Device* device);
+	virtual ~DeviceSplitKernel();
+
+	bool load_kernels(const DeviceRequestedFeatures& requested_features);
+	bool path_trace(DeviceTask *task,
+	                RenderTile& rtile,
+	                device_memory& kgbuffer,
+	                device_memory& kernel_data);
+
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
+	size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size);
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs) = 0;
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0;
+	virtual int2 split_kernel_local_size() = 0;
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_SPLIT_KERNEL_H__ */
+
+
+
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 48d18035c13..ca303365627 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -17,12 +17,12 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "device_task.h"
+#include "device/device_task.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_algorithm.h"
-#include "util_time.h"
+#include "util/util_algorithm.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 8bd54c3d2b0..feee89fd6e4 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -17,11 +17,11 @@
 #ifndef __DEVICE_TASK_H__
 #define __DEVICE_TASK_H__
 
-#include "device_memory.h"
+#include "device/device_memory.h"
 
-#include "util_function.h"
-#include "util_list.h"
-#include "util_task.h"
+#include "util/util_function.h"
+#include "util/util_list.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -51,6 +51,8 @@ public:
 	int shader_filter;
 	int shader_x, shader_w;
 
+	int passes_size;
+
 	explicit DeviceTask(Type type = PATH_TRACE);
 
 	int get_subtask_count(int num, int max_size = 0);
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 4023ba89a10..764216d0dfa 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -16,39 +16,39 @@
 
 #ifdef WITH_OPENCL
 
-#include "device.h"
+#include "device/device.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
 
 #include "clew.h"
 
 CCL_NAMESPACE_BEGIN
 
-#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
-
-/* Macro declarations used with split kernel */
-
-/* Macro to enable/disable work-stealing */
-#define __WORK_STEALING__
-
-#define SPLIT_KERNEL_LOCAL_SIZE_X 64
-#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing */
+#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
+/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
+#  undef clEnqueueNDRangeKernel
+#  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
+	clFinish(a); \
+	CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
+	clFinish(a);
+
+#  undef clEnqueueWriteBuffer
+#  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
+	clFinish(a); \
+	CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
+	clFinish(a);
+
+#  undef clEnqueueReadBuffer
+#  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
+	clFinish(a); \
+	CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
+	clFinish(a);
+#endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
 
-/* This value may be tuned according to the scene we are rendering.
- *
- * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
- * ray-bounces will improve performance.
- */
-#define PATH_ITER_INC_FACTOR 8
-
-/* When allocate global memory in chunks. We may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
- * Since some bytes may be needed for aligning chunks of memory;
- * This is the amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
 
 struct OpenCLPlatformDevice {
 	OpenCLPlatformDevice(cl_platform_id platform_id,
@@ -90,6 +90,54 @@ public:
 	                              cl_device_id device_id);
 	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
 	                               bool force_all = false);
+	static bool use_single_program();
+
+	/* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
+
+	/* Platform information. */
+	static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
+	static cl_uint get_num_platforms();
+
+	static bool get_platforms(vector<cl_platform_id> *platform_ids,
+	                          cl_int *error = NULL);
+	static vector<cl_platform_id> get_platforms();
+
+	static bool get_platform_name(cl_platform_id platform_id,
+	                              string *platform_name);
+	static string get_platform_name(cl_platform_id platform_id);
+
+	static bool get_num_platform_devices(cl_platform_id platform_id,
+	                                     cl_device_type device_type,
+	                                     cl_uint *num_devices,
+	                                     cl_int *error = NULL);
+	static cl_uint get_num_platform_devices(cl_platform_id platform_id,
+	                                        cl_device_type device_type);
+
+	static bool get_platform_devices(cl_platform_id platform_id,
+	                                 cl_device_type device_type,
+	                                 vector<cl_device_id> *device_ids,
+	                                 cl_int* error = NULL);
+	static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
+	                                                 cl_device_type device_type);
+
+	/* Device information. */
+	static bool get_device_name(cl_device_id device_id,
+	                            string *device_name,
+	                            cl_int* error = NULL);
+
+	static string get_device_name(cl_device_id device_id);
+
+	static bool get_device_type(cl_device_id device_id,
+	                            cl_device_type *device_type,
+	                            cl_int* error = NULL);
+	static cl_device_type get_device_type(cl_device_id device_id);
+
+	/* Get somewhat more readable device name.
+	 * Main difference is AMD OpenCL here which only gives code name
+	 * for the regular device name. This will give more sane device
+	 * name using some extensions.
+	 */
+	static string get_readable_device_name(cl_device_id device_id);
 };
 
 /* Thread safe cache for contexts and programs.
@@ -248,6 +296,7 @@ public:
 
 	bool device_initialized;
 	string platform_name;
+	string device_name;
 
 	bool opencl_error(cl_int err);
 	void opencl_error(const string& message);
@@ -266,10 +315,10 @@ public:
 
 	/* Has to be implemented by the real device classes.
 	 * The base device will then load all these programs. */
-	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
 	                          vector<OpenCLProgram*> &programs) = 0;
 
-	void mem_alloc(device_memory& mem, MemoryType type);
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type);
 	void mem_copy_to(device_memory& mem);
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
 	void mem_zero(device_memory& mem);
@@ -326,16 +375,39 @@ protected:
 
 	class ArgumentWrapper {
 	public:
-		ArgumentWrapper() : size(0), pointer(NULL) {}
-		template <typename T>
+		ArgumentWrapper() : size(0), pointer(NULL)
+		{
+		}
+
+		ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
+		                                           pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
+		ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
+		                                              pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
 		ArgumentWrapper(T& argument) : size(sizeof(argument)),
-		                               pointer(&argument) { }
+		                               pointer(&argument)
+		{
+		}
+
 		ArgumentWrapper(int argument) : size(sizeof(int)),
 		                                int_value(argument),
-		                                pointer(&int_value) { }
+		                                pointer(&int_value)
+		{
+		}
+
 		ArgumentWrapper(float argument) : size(sizeof(float)),
 		                                  float_value(argument),
-		                                  pointer(&float_value) { }
+		                                  pointer(&float_value)
+		{
+		}
+
 		size_t size;
 		int int_value;
 		float float_value;
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index a2b900312e7..52d0662a8e3 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -16,15 +16,15 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -82,9 +82,10 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	cpPlatform = platform_device.platform_id;
 	cdDevice = platform_device.device_id;
 	platform_name = platform_device.platform_name;
+	device_name = platform_device.device_name;
 	VLOG(2) << "Creating new Cycles device for OpenCL platform "
 	        << platform_name << ", device "
-	        << platform_device.device_name << ".";
+	        << device_name << ".";
 
 	{
 		/* try to use cached context */
@@ -113,12 +114,16 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	}
 
 	cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
-	if(opencl_error(ciErr))
+	if(opencl_error(ciErr)) {
+		opencl_error("OpenCL: Error creating command queue");
 		return;
+	}
 
 	null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
-	if(opencl_error(ciErr))
+	if(opencl_error(ciErr)) {
+		opencl_error("OpenCL: Error creating memory buffer for NULL");
 		return;
+	}
 
 	fprintf(stderr, "Device init success\n");
 	device_initialized = true;
@@ -147,10 +152,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase()
 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
 	const void * /*private_info*/, size_t /*cb*/, void *user_data)
 {
-	char name[256];
-	clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
-
-	fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
+	string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
+	fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
 }
 
 bool OpenCLDeviceBase::opencl_version_check()
@@ -191,6 +194,8 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
 
 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
 {
+	VLOG(2) << "Loading kernels for platform " << platform_name
+	        << ", device " << device_name << ".";
 	/* Verify if device was initialized. */
 	if(!device_initialized) {
 		fprintf(stderr, "OpenCL: failed to initialize device.\n");
@@ -206,11 +211,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	base_program.add_kernel(ustring("convert_to_half_float"));
 	base_program.add_kernel(ustring("shader"));
 	base_program.add_kernel(ustring("bake"));
+	base_program.add_kernel(ustring("zero_buffer"));
 
 	vector<OpenCLProgram*> programs;
 	programs.push_back(&base_program);
 	/* Call actual class to fill the vector with its programs. */
-	load_kernels(requested_features, programs);
+	if(!load_kernels(requested_features, programs)) {
+		return false;
+	}
 
 	/* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
 	 * serialize the calls internally, so it's not much use right now.
@@ -242,8 +250,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	return true;
 }
 
-void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type)
+void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryType type)
 {
+	if(name) {
+		VLOG(1) << "Buffer allocate: " << name << ", "
+			    << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			    << string_human_readable_size(mem.memory_size()) << ")";
+	}
+
 	size_t size = mem.memory_size();
 
 	cl_mem_flags mem_flag;
@@ -311,8 +325,61 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
 void OpenCLDeviceBase::mem_zero(device_memory& mem)
 {
 	if(mem.device_pointer) {
-		memset((void*)mem.data_pointer, 0, mem.memory_size());
-		mem_copy_to(mem);
+		if(base_program.is_loaded()) {
+			cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+
+			size_t global_size[] = {1024, 1024};
+			size_t num_threads = global_size[0] * global_size[1];
+
+			cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
+			cl_ulong d_offset = 0;
+			cl_ulong d_size = 0;
+
+			while(d_offset < mem.memory_size()) {
+				d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
+
+				kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+
+				ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+				                               ckZeroBuffer,
+				                               2,
+				                               NULL,
+				                               global_size,
+				                               NULL,
+				                               0,
+				                               NULL,
+				                               NULL);
+				opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+
+				d_offset += d_size;
+			}
+		}
+
+		if(mem.data_pointer) {
+			memset((void*)mem.data_pointer, 0, mem.memory_size());
+		}
+
+		if(!base_program.is_loaded()) {
+			void* zero = (void*)mem.data_pointer;
+
+			if(!mem.data_pointer) {
+				zero = util_aligned_malloc(mem.memory_size(), 16);
+				memset(zero, 0, mem.memory_size());
+			}
+
+			opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+			                                   CL_MEM_PTR(mem.device_pointer),
+			                                   CL_TRUE,
+			                                   0,
+			                                   mem.memory_size(),
+			                                   zero,
+			                                   0,
+			                                   NULL, NULL));
+
+			if(!mem.data_pointer) {
+				util_aligned_free(zero);
+			}
+		}
 	}
 }
 
@@ -337,7 +404,7 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
 		device_vector<uchar> *data = new device_vector<uchar>();
 		data->copy((uchar*)host, size);
 
-		mem_alloc(*data, MEM_READ_ONLY);
+		mem_alloc(name, *data, MEM_READ_ONLY);
 		i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
 	}
 	else {
@@ -356,7 +423,7 @@ void OpenCLDeviceBase::tex_alloc(const char *name,
 	VLOG(1) << "Texture allocate: " << name << ", "
 	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 	        << string_human_readable_size(mem.memory_size()) << ")";
-	mem_alloc(mem, MEM_READ_ONLY);
+	mem_alloc(NULL, mem, MEM_READ_ONLY);
 	mem_copy_to(mem);
 	assert(mem_map.find(name) == mem_map.end());
 	mem_map.insert(MemMap::value_type(name, mem.device_pointer));
@@ -460,7 +527,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_
 
 #define KERNEL_TEX(type, ttype, name) \
 set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
 
 	start_arg_index += kernel_set_args(ckFilmConvertKernel,
@@ -511,7 +578,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
 
 #define KERNEL_TEX(type, ttype, name) \
 	set_kernel_arg_mem(kernel, &start_arg_index, #name);
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
 
 	start_arg_index += kernel_set_args(kernel,
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index 6ea7619e022..a2fd1d71156 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -16,15 +16,15 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -43,11 +43,12 @@ public:
 		return true;
 	}
 
-	virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
+	virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
 	                          vector<OpenCLProgram*> &programs)
 	{
 		path_trace_program.add_kernel(ustring("path_trace"));
 		programs.push_back(&path_trace_program);
+		return true;
 	}
 
 	~OpenCLDeviceMegaKernel()
@@ -83,7 +84,7 @@ public:
 
 #define KERNEL_TEX(type, ttype, name) \
 		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
 
 		start_arg_index += kernel_set_args(ckPathTraceKernel,
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 3c3c2150128..579dbc84f53 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -16,1290 +16,359 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
 
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "device/device_split_kernel.h"
+
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* TODO(sergey): This is to keep tile split on OpenCL level working
- * for now, since without this view-port render does not work as it
- * should.
- *
- * Ideally it'll be done on the higher level, but we need to get ready
- * for merge rather soon, so let's keep split logic private here in
- * the file.
- */
-class SplitRenderTile : public RenderTile {
-public:
-	SplitRenderTile()
-		: RenderTile(),
-		  buffer_offset_x(0),
-		  buffer_offset_y(0),
-		  rng_state_offset_x(0),
-		  rng_state_offset_y(0),
-		  buffer_rng_state_stride(0) {}
-
-	explicit SplitRenderTile(RenderTile& tile)
-		: RenderTile(),
-		  buffer_offset_x(0),
-		  buffer_offset_y(0),
-		  rng_state_offset_x(0),
-		  rng_state_offset_y(0),
-		  buffer_rng_state_stride(0)
-	{
-		x = tile.x;
-		y = tile.y;
-		w = tile.w;
-		h = tile.h;
-		start_sample = tile.start_sample;
-		num_samples = tile.num_samples;
-		sample = tile.sample;
-		resolution = tile.resolution;
-		offset = tile.offset;
-		stride = tile.stride;
-		buffer = tile.buffer;
-		rng_state = tile.rng_state;
-		buffers = tile.buffers;
+class OpenCLSplitKernel;
+
+static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
+{
+	string build_options = "-D__SPLIT_KERNEL__ ";
+	build_options += requested_features.get_build_options();
+
+	/* Set compute device build option. */
+	cl_device_type device_type;
+	OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr);
+	assert(device->ciErr == CL_SUCCESS);
+	if(device_type == CL_DEVICE_TYPE_GPU) {
+		build_options += " -D__COMPUTE_DEVICE_GPU__";
 	}
 
-	/* Split kernel is device global memory constrained;
-	 * hence split kernel cant render big tile size's in
-	 * one go. If the user sets a big tile size (big tile size
-	 * is a term relative to the available device global memory),
-	 * we split the tile further and then call path_trace on
-	 * each of those split tiles. The following variables declared,
-	 * assist in achieving that purpose
-	 */
-	int buffer_offset_x;
-	int buffer_offset_y;
-	int rng_state_offset_x;
-	int rng_state_offset_y;
-	int buffer_rng_state_stride;
-};
+	return build_options;
+}
 
 /* OpenCLDeviceSplitKernel's declaration/definition. */
 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
 {
 public:
-	/* Kernel declaration. */
+	DeviceSplitKernel *split_kernel;
 	OpenCLProgram program_data_init;
-	OpenCLProgram program_scene_intersect;
-	OpenCLProgram program_lamp_emission;
-	OpenCLProgram program_queue_enqueue;
-	OpenCLProgram program_background_buffer_update;
-	OpenCLProgram program_shader_eval;
-	OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
-	OpenCLProgram program_direct_lighting;
-	OpenCLProgram program_shadow_blocked;
-	OpenCLProgram program_next_iteration_setup;
-	OpenCLProgram program_sum_all_radiance;
-
-	/* Global memory variables [porting]; These memory is used for
-	 * co-operation between different kernels; Data written by one
-	 * kernel will be available to another kernel via this global
-	 * memory.
-	 */
-	cl_mem rng_coop;
-	cl_mem throughput_coop;
-	cl_mem L_transparent_coop;
-	cl_mem PathRadiance_coop;
-	cl_mem Ray_coop;
-	cl_mem PathState_coop;
-	cl_mem Intersection_coop;
-	cl_mem kgbuffer;  /* KernelGlobals buffer. */
-
-	/* Global buffers for ShaderData. */
-	cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
-	cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
-	                        * shadow_blocked kernel.
-	                        */
-
-	/* Global memory required for shadow blocked and accum_radiance. */
-	cl_mem BSDFEval_coop;
-	cl_mem ISLamp_coop;
-	cl_mem LightRay_coop;
-	cl_mem AOAlpha_coop;
-	cl_mem AOBSDF_coop;
-	cl_mem AOLightRay_coop;
-	cl_mem Intersection_coop_shadow;
-
-#ifdef WITH_CYCLES_DEBUG
-	/* DebugData memory */
-	cl_mem debugdata_coop;
-#endif
-
-	/* Global state array that tracks ray state. */
-	cl_mem ray_state;
-
-	/* Per sample buffers. */
-	cl_mem per_sample_output_buffers;
-
-	/* Denotes which sample each ray is being processed for. */
-	cl_mem work_array;
-
-	/* Queue */
-	cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
-	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
-	                     * Tracks the size of each queue.
-	                     */
-
-	/* Flag to make sceneintersect and lampemission kernel use queues. */
-	cl_mem use_queues_flag;
-
-	/* Amount of memory in output buffer associated with one pixel/thread. */
-	size_t per_thread_output_buffer_size;
-
-	/* Total allocatable available device memory. */
-	size_t total_allocatable_memory;
-
-	/* host version of ray_state; Used in checking host path-iteration
-	 * termination.
-	 */
-	char *hostRayStateArray;
-
-	/* Number of path-iterations to be done in one shot. */
-	unsigned int PathIteration_times;
-
-#ifdef __WORK_STEALING__
-	/* Work pool with respect to each work group. */
-	cl_mem work_pool_wgs;
-
-	/* Denotes the maximum work groups possible w.r.t. current tile size. */
-	unsigned int max_work_groups;
-#endif
-
-	/* clos_max value for which the kernels have been loaded currently. */
-	int current_max_closure;
-
-	/* Marked True in constructor and marked false at the end of path_trace(). */
-	bool first_tile;
-
-	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
-	: OpenCLDeviceBase(info, stats, background_)
-	{
-		background = background_;
-
-		/* Initialize cl_mem variables. */
-		kgbuffer = NULL;
-		sd = NULL;
-		sd_DL_shadow = NULL;
-
-		rng_coop = NULL;
-		throughput_coop = NULL;
-		L_transparent_coop = NULL;
-		PathRadiance_coop = NULL;
-		Ray_coop = NULL;
-		PathState_coop = NULL;
-		Intersection_coop = NULL;
-		ray_state = NULL;
-
-		AOAlpha_coop = NULL;
-		AOBSDF_coop = NULL;
-		AOLightRay_coop = NULL;
-		BSDFEval_coop = NULL;
-		ISLamp_coop = NULL;
-		LightRay_coop = NULL;
-		Intersection_coop_shadow = NULL;
-
-#ifdef WITH_CYCLES_DEBUG
-		debugdata_coop = NULL;
-#endif
-
-		work_array = NULL;
-
-		/* Queue. */
-		Queue_data = NULL;
-		Queue_index = NULL;
-		use_queues_flag = NULL;
-
-		per_sample_output_buffers = NULL;
-
-		per_thread_output_buffer_size = 0;
-		hostRayStateArray = NULL;
-		PathIteration_times = PATH_ITER_INC_FACTOR;
-#ifdef __WORK_STEALING__
-		work_pool_wgs = NULL;
-		max_work_groups = 0;
-#endif
-		current_max_closure = -1;
-		first_tile = true;
-
-		/* Get device's maximum memory that can be allocated. */
-		ciErr = clGetDeviceInfo(cdDevice,
-		                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
-		                        sizeof(size_t),
-		                        &total_allocatable_memory,
-		                        NULL);
-		assert(ciErr == CL_SUCCESS);
-		if(platform_name == "AMD Accelerated Parallel Processing") {
-			/* This value is tweak-able; AMD platform does not seem to
-			 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
-			 * is considered for further computation.
-			 */
-			total_allocatable_memory /= 2;
-		}
-	}
+	OpenCLProgram program_state_buffer_size;
 
-	virtual bool show_samples() const {
-		return false;
-	}
+	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
 
-	/* Split kernel utility functions. */
-	size_t get_tex_size(const char *tex_name)
+	~OpenCLDeviceSplitKernel()
 	{
-		cl_mem ptr;
-		size_t ret_size = 0;
-		MemMap::iterator i = mem_map.find(tex_name);
-		if(i != mem_map.end()) {
-			ptr = CL_MEM_PTR(i->second);
-			ciErr = clGetMemObjectInfo(ptr,
-			                           CL_MEM_SIZE,
-			                           sizeof(ret_size),
-			                           &ret_size,
-			                           NULL);
-			assert(ciErr == CL_SUCCESS);
-		}
-		return ret_size;
+		task_pool.stop();
+
+		/* Release kernels */
+		program_data_init.release();
+
+		delete split_kernel;
 	}
 
-	size_t get_shader_data_size(size_t max_closure)
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
+	                          vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
 	{
-		/* ShaderData size with variable size ShaderClosure array */
-		return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure));
+		bool single_program = OpenCLInfo::use_single_program();
+		program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
+		                                  single_program ? "split" : "split_data_init",
+		                                  single_program ? "kernel_split.cl" : "kernel_data_init.cl",
+		                                  get_build_options(this, requested_features));
+
+		program_data_init.add_kernel(ustring("path_trace_data_init"));
+		programs.push_back(&program_data_init);
+
+		program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
+		                                  single_program ? "split" : "split_state_buffer_size",
+		                                  single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl",
+		                                  get_build_options(this, requested_features));
+		program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
+		programs.push_back(&program_state_buffer_size);
+
+		return split_kernel->load_kernels(requested_features);
 	}
 
-	/* Returns size of KernelGlobals structure associated with OpenCL. */
-	size_t get_KernelGlobals_size()
+	void thread_run(DeviceTask *task)
 	{
-		/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
-		 * fetch its size.
-		 */
-		typedef struct KernelGlobals {
-			ccl_constant KernelData *data;
+		if(task->type == DeviceTask::FILM_CONVERT) {
+			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+		}
+		else if(task->type == DeviceTask::SHADER) {
+			shader(*task);
+		}
+		else if(task->type == DeviceTask::PATH_TRACE) {
+			RenderTile tile;
+
+			/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
+			 * fetch its size.
+			 */
+			typedef struct KernelGlobals {
+				ccl_constant KernelData *data;
 #define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name;
-#include "kernel_textures.h"
+				ccl_global type *name;
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
-			void *sd_input;
-			void *isect_shadow;
-		} KernelGlobals;
+				SplitData split_data;
+				SplitParams split_param_data;
+			} KernelGlobals;
+
+			/* Allocate buffer for kernel globals */
+			device_memory kgbuffer;
+			kgbuffer.resize(sizeof(KernelGlobals));
+			mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
+
+			/* Keep rendering tiles until done. */
+			while(task->acquire_tile(this, tile)) {
+				split_kernel->path_trace(task,
+				                         tile,
+				                         kgbuffer,
+				                         *const_mem_map["__data"]);
+
+				/* Complete kernel execution before releasing the tile. */
+				/* This helps in multi-device render;
+				 * The device that reaches the critical-section function
+				 * release_tile waits (stalling other devices from entering
+				 * release_tile) for all kernels to complete. If device1 (a
+				 * slow-render device) reaches release_tile first then it would
+				 * stall device2 (a fast-render device) from proceeding to render
+				 * next tile.
+				 */
+				clFinish(cqCommandQueue);
 
-		return sizeof(KernelGlobals);
+				task->release_tile(tile);
+			}
+
+			mem_free(kgbuffer);
+		}
+	}
+
+protected:
+	/* ** These are for working around some compiler-specific bugs ** */
+
+	string build_options_for_base_program(
+	        const DeviceRequestedFeatures& requested_features)
+	{
+		return requested_features.get_build_options();
 	}
 
-	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
-	                          vector<OpenCLProgram*> &programs)
+	friend class OpenCLSplitKernel;
+	friend class OpenCLSplitKernelFunction;
+};
+
+class OpenCLSplitKernelFunction : public SplitKernelFunction {
+public:
+	OpenCLDeviceSplitKernel* device;
+	OpenCLDeviceBase::OpenCLProgram program;
+
+	OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
+	~OpenCLSplitKernelFunction() { program.release(); }
+
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
 	{
-		string build_options = "-D__SPLIT_KERNEL__ ";
-#ifdef __WORK_STEALING__
-		build_options += "-D__WORK_STEALING__ ";
-#endif
-		build_options += requested_features.get_build_options();
-
-		/* Set compute device build option. */
-		cl_device_type device_type;
-		ciErr = clGetDeviceInfo(cdDevice,
-		                        CL_DEVICE_TYPE,
-		                        sizeof(cl_device_type),
-		                        &device_type,
-		                        NULL);
-		assert(ciErr == CL_SUCCESS);
-		if(device_type == CL_DEVICE_TYPE_GPU) {
-			build_options += " -D__COMPUTE_DEVICE_GPU__";
+		device->kernel_set_args(program(), 0, kg, data);
+
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                                       program(),
+		                                       2,
+		                                       NULL,
+		                                       dim.global_size,
+		                                       dim.local_size,
+		                                       0,
+		                                       NULL,
+		                                       NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		if(device->ciErr != CL_SUCCESS) {
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
+			return false;
 		}
 
-#define GLUE(a, b) a ## b
-#define LOAD_KERNEL(name) \
-	do { \
-		GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \
-		GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \
-		programs.push_back(&GLUE(program_, name)); \
-	} while(false)
-
-		LOAD_KERNEL(data_init);
-		LOAD_KERNEL(scene_intersect);
-		LOAD_KERNEL(lamp_emission);
-		LOAD_KERNEL(queue_enqueue);
-		LOAD_KERNEL(background_buffer_update);
-		LOAD_KERNEL(shader_eval);
-		LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
-		LOAD_KERNEL(direct_lighting);
-		LOAD_KERNEL(shadow_blocked);
-		LOAD_KERNEL(next_iteration_setup);
-		LOAD_KERNEL(sum_all_radiance);
-
-#undef FIND_KERNEL
-#undef GLUE
-
-		current_max_closure = requested_features.max_closure;
+		return true;
 	}
+};
 
-	~OpenCLDeviceSplitKernel()
+class OpenCLSplitKernel : public DeviceSplitKernel {
+	OpenCLDeviceSplitKernel *device;
+public:
+	explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
+	}
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
+	                                                       const DeviceRequestedFeatures& requested_features)
 	{
-		task_pool.stop();
+		OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);
 
-		/* Release kernels */
-		program_data_init.release();
-		program_scene_intersect.release();
-		program_lamp_emission.release();
-		program_queue_enqueue.release();
-		program_background_buffer_update.release();
-		program_shader_eval.release();
-		program_holdout_emission_blurring_pathtermination_ao.release();
-		program_direct_lighting.release();
-		program_shadow_blocked.release();
-		program_next_iteration_setup.release();
-		program_sum_all_radiance.release();
-
-		/* Release global memory */
-		release_mem_object_safe(rng_coop);
-		release_mem_object_safe(throughput_coop);
-		release_mem_object_safe(L_transparent_coop);
-		release_mem_object_safe(PathRadiance_coop);
-		release_mem_object_safe(Ray_coop);
-		release_mem_object_safe(PathState_coop);
-		release_mem_object_safe(Intersection_coop);
-		release_mem_object_safe(kgbuffer);
-		release_mem_object_safe(sd);
-		release_mem_object_safe(sd_DL_shadow);
-		release_mem_object_safe(ray_state);
-		release_mem_object_safe(AOAlpha_coop);
-		release_mem_object_safe(AOBSDF_coop);
-		release_mem_object_safe(AOLightRay_coop);
-		release_mem_object_safe(BSDFEval_coop);
-		release_mem_object_safe(ISLamp_coop);
-		release_mem_object_safe(LightRay_coop);
-		release_mem_object_safe(Intersection_coop_shadow);
-#ifdef WITH_CYCLES_DEBUG
-		release_mem_object_safe(debugdata_coop);
-#endif
-		release_mem_object_safe(use_queues_flag);
-		release_mem_object_safe(Queue_data);
-		release_mem_object_safe(Queue_index);
-		release_mem_object_safe(work_array);
-#ifdef __WORK_STEALING__
-		release_mem_object_safe(work_pool_wgs);
-#endif
-		release_mem_object_safe(per_sample_output_buffers);
-
-		if(hostRayStateArray != NULL) {
-			free(hostRayStateArray);
+		bool single_program = OpenCLInfo::use_single_program();
+		kernel->program =
+			OpenCLDeviceBase::OpenCLProgram(device,
+			                                single_program ? "split" : "split_" + kernel_name,
+			                                single_program ? "kernel_split.cl" : "kernel_" + kernel_name + ".cl",
+			                                get_build_options(device, requested_features));
+
+		kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
+		kernel->program.load();
+
+		if(!kernel->program.is_loaded()) {
+			delete kernel;
+			return NULL;
 		}
+
+		return kernel;
 	}
 
-	void path_trace(DeviceTask *task,
-	                SplitRenderTile& rtile,
-	                int2 max_render_feasible_tile_size)
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
 	{
-		/* cast arguments to cl types */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
-		cl_int d_x = rtile.x;
-		cl_int d_y = rtile.y;
-		cl_int d_w = rtile.w;
-		cl_int d_h = rtile.h;
-		cl_int d_offset = rtile.offset;
-		cl_int d_stride = rtile.stride;
-
-		/* Make sure that set render feasible tile size is a multiple of local
-		 * work size dimensions.
-		 */
-		assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
-		assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
+		device_vector<uint64_t> size_buffer;
+		size_buffer.resize(1);
+		device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+		uint threads = num_threads;
+		device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
+
+		size_t global_size = 64;
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                               device->program_state_buffer_size(),
+		                               1,
+		                               NULL,
+		                               &global_size,
+		                               NULL,
+		                               0,
+		                               NULL,
+		                               NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
+		device->mem_free(size_buffer);
+
+		if(device->ciErr != CL_SUCCESS) {
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
+			return 0;
+		}
+
+		return *size_buffer.get_data();
+	}
 
-		size_t global_size[2];
-		size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
-		                        SPLIT_KERNEL_LOCAL_SIZE_Y};
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs
+	                                            )
+	{
+		cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
 
 		/* Set the range of samples to be processed for every ray in
 		 * path-regeneration logic.
 		 */
 		cl_int start_sample = rtile.start_sample;
 		cl_int end_sample = rtile.start_sample + rtile.num_samples;
-		cl_int num_samples = rtile.num_samples;
-
-#ifdef __WORK_STEALING__
-		global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
-		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
-		unsigned int num_parallel_samples = 1;
-#else
-		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
-		unsigned int num_threads = max_render_feasible_tile_size.x *
-		                           max_render_feasible_tile_size.y;
-		unsigned int num_tile_columns_possible = num_threads / global_size[1];
-		/* Estimate number of parallel samples that can be
-		 * processed in parallel.
-		 */
-		unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
-		                                        rtile.num_samples);
-		/* Wavefront size in AMD is 64.
-		 * TODO(sergey): What about other platforms?
-		 */
-		if(num_parallel_samples >= 64) {
-			/* TODO(sergey): Could use generic round-up here. */
-			num_parallel_samples = (num_parallel_samples / 64) * 64;
-		}
-		assert(num_parallel_samples != 0);
-
-		global_size[0] = d_w * num_parallel_samples;
-#endif  /* __WORK_STEALING__ */
-
-		assert(global_size[0] * global_size[1] <=
-		       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
-
-		/* Allocate all required global memory once. */
-		if(first_tile) {
-			size_t num_global_elements = max_render_feasible_tile_size.x *
-			                             max_render_feasible_tile_size.y;
-			/* TODO(sergey): This will actually over-allocate if
-			 * particular kernel does not support multiclosure.
-			 */
-			size_t shaderdata_size = get_shader_data_size(current_max_closure);
-
-#ifdef __WORK_STEALING__
-			/* Calculate max groups */
-			size_t max_global_size[2];
-			size_t tile_x = max_render_feasible_tile_size.x;
-			size_t tile_y = max_render_feasible_tile_size.y;
-			max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
-			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
-			max_work_groups = (max_global_size[0] * max_global_size[1]) /
-			                  (local_size[0] * local_size[1]);
-			/* Allocate work_pool_wgs memory. */
-			work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
-#endif  /* __WORK_STEALING__ */
-
-			/* Allocate queue_index memory only once. */
-			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
-			use_queues_flag = mem_alloc(sizeof(char));
-			kgbuffer = mem_alloc(get_KernelGlobals_size());
-
-			/* Create global buffers for ShaderData. */
-			sd = mem_alloc(num_global_elements * shaderdata_size);
-			sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);
-
-			/* Creation of global memory buffers which are shared among
-			 * the kernels.
-			 */
-			rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
-			throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
-			L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
-			PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
-			Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
-			Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
-			AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
-			ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
-			LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));
-
-#ifdef WITH_CYCLES_DEBUG
-			debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
-#endif
-
-			ray_state = mem_alloc(num_global_elements * sizeof(char));
-
-			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
-			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
-
-			Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
-			work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
-			per_sample_output_buffers = mem_alloc(num_global_elements *
-			                                      per_thread_output_buffer_size);
-		}
-
-		cl_int dQueue_size = global_size[0] * global_size[1];
 
 		cl_uint start_arg_index =
-			kernel_set_args(program_data_init(),
+			device->kernel_set_args(device->program_data_init(),
 			                0,
-			                kgbuffer,
-			                sd_DL_shadow,
-			                d_data,
-			                per_sample_output_buffers,
-			                d_rng_state,
-			                rng_coop,
-			                throughput_coop,
-			                L_transparent_coop,
-			                PathRadiance_coop,
-			                Ray_coop,
-			                PathState_coop,
-			                Intersection_coop_shadow,
-			                ray_state);
+			                kernel_globals,
+			                kernel_data,
+			                split_data,
+			                num_global_elements,
+			                ray_state,
+			                rtile.rng_state);
 
 /* TODO(sergey): Avoid map lookup here. */
 #define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(program_data_init(), &start_arg_index, #name);
-#include "kernel_textures.h"
+	device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name);
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
 
 		start_arg_index +=
-			kernel_set_args(program_data_init(),
+			device->kernel_set_args(device->program_data_init(),
 			                start_arg_index,
 			                start_sample,
-			                d_x,
-			                d_y,
-			                d_w,
-			                d_h,
-			                d_offset,
-			                d_stride,
-			                rtile.rng_state_offset_x,
-			                rtile.rng_state_offset_y,
-			                rtile.buffer_rng_state_stride,
-			                Queue_data,
-			                Queue_index,
+			                end_sample,
+			                rtile.x,
+			                rtile.y,
+			                rtile.w,
+			                rtile.h,
+			                rtile.offset,
+			                rtile.stride,
+			                queue_index,
 			                dQueue_size,
 			                use_queues_flag,
-			                work_array,
-#ifdef __WORK_STEALING__
 			                work_pool_wgs,
-			                num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
-			                debugdata_coop,
-#endif
-			                num_parallel_samples);
-
-		kernel_set_args(program_scene_intersect(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                rng_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                d_w,
-		                d_h,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag,
-#ifdef WITH_CYCLES_DEBUG
-		                debugdata_coop,
-#endif
-		                num_parallel_samples);
-
-		kernel_set_args(program_lamp_emission(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                throughput_coop,
-		                PathRadiance_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                d_w,
-		                d_h,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag,
-		                num_parallel_samples);
-
-		kernel_set_args(program_queue_enqueue(),
-		                0,
-		                Queue_data,
-		                Queue_index,
-		                ray_state,
-		                dQueue_size);
-
-		kernel_set_args(program_background_buffer_update(),
-		                 0,
-		                 kgbuffer,
-		                 d_data,
-		                 per_sample_output_buffers,
-		                 d_rng_state,
-		                 rng_coop,
-		                 throughput_coop,
-		                 PathRadiance_coop,
-		                 Ray_coop,
-		                 PathState_coop,
-		                 L_transparent_coop,
-		                 ray_state,
-		                 d_w,
-		                 d_h,
-		                 d_x,
-		                 d_y,
-		                 d_stride,
-		                 rtile.rng_state_offset_x,
-		                 rtile.rng_state_offset_y,
-		                 rtile.buffer_rng_state_stride,
-		                 work_array,
-		                 Queue_data,
-		                 Queue_index,
-		                 dQueue_size,
-		                 end_sample,
-		                 start_sample,
-#ifdef __WORK_STEALING__
-		                 work_pool_wgs,
-		                 num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
-		                 debugdata_coop,
-#endif
-		                 num_parallel_samples);
-
-		kernel_set_args(program_shader_eval(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                per_sample_output_buffers,
-		                rng_coop,
-		                throughput_coop,
-		                L_transparent_coop,
-		                PathRadiance_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                AOAlpha_coop,
-		                AOBSDF_coop,
-		                AOLightRay_coop,
-		                d_w,
-		                d_h,
-		                d_x,
-		                d_y,
-		                d_stride,
-		                ray_state,
-		                work_array,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-#ifdef __WORK_STEALING__
-		                start_sample,
-#endif
-		                num_parallel_samples);
-
-		kernel_set_args(program_direct_lighting(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                PathState_coop,
-		                ISLamp_coop,
-		                LightRay_coop,
-		                BSDFEval_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(program_shadow_blocked(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                PathState_coop,
-		                LightRay_coop,
-		                AOLightRay_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(program_next_iteration_setup(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                throughput_coop,
-		                PathRadiance_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                LightRay_coop,
-		                ISLamp_coop,
-		                BSDFEval_coop,
-		                AOLightRay_coop,
-		                AOBSDF_coop,
-		                AOAlpha_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag);
-
-		kernel_set_args(program_sum_all_radiance(),
-		                0,
-		                d_data,
-		                d_buffer,
-		                per_sample_output_buffers,
-		                num_parallel_samples,
-		                d_w,
-		                d_h,
-		                d_stride,
-		                rtile.buffer_offset_x,
-		                rtile.buffer_offset_y,
-		                rtile.buffer_rng_state_stride,
-		                start_sample);
-
-		/* Macro for Enqueuing split kernels. */
-#define GLUE(a, b) a ## b
-#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
-		{ \
-			ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
-			                               GLUE(program_, \
-			                                    kernelName)(), \
-			                               2, \
-			                               NULL, \
-			                               globalSize, \
-			                               localSize, \
-			                               0, \
-			                               NULL, \
-			                               NULL); \
-			opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
-			if(ciErr != CL_SUCCESS) { \
-				string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
-				                               clewErrorString(ciErr)); \
-				opencl_error(message); \
-				return; \
-			} \
-		} (void) 0
+			                rtile.num_samples,
+			                rtile.buffer);
 
 		/* Enqueue ckPathTraceKernel_data_init kernel. */
-		ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
-		bool activeRaysAvailable = true;
-
-		/* Record number of time host intervention has been made */
-		unsigned int numHostIntervention = 0;
-		unsigned int numNextPathIterTimes = PathIteration_times;
-		bool canceled = false;
-		while(activeRaysAvailable) {
-			/* Twice the global work size of other kernels for
-			 * ckPathTraceKernel_shadow_blocked_direct_lighting. */
-			size_t global_size_shadow_blocked[2];
-			global_size_shadow_blocked[0] = global_size[0] * 2;
-			global_size_shadow_blocked[1] = global_size[1];
-
-			/* Do path-iteration in host [Enqueue Path-iteration kernels. */
-			for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
-				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
-				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
-
-				if(task->get_cancel()) {
-					canceled = true;
-					break;
-				}
-			}
-
-			/* Read ray-state into Host memory to decide if we should exit
-			 * path-iteration in host.
-			 */
-			ciErr = clEnqueueReadBuffer(cqCommandQueue,
-			                            ray_state,
-			                            CL_TRUE,
-			                            0,
-			                            global_size[0] * global_size[1] * sizeof(char),
-			                            hostRayStateArray,
-			                            0,
-			                            NULL,
-			                            NULL);
-			assert(ciErr == CL_SUCCESS);
-
-			activeRaysAvailable = false;
-
-			for(int rayStateIter = 0;
-			    rayStateIter < global_size[0] * global_size[1];
-			    ++rayStateIter)
-			{
-				if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
-					/* Not all rays are RAY_INACTIVE. */
-					activeRaysAvailable = true;
-					break;
-				}
-			}
-
-			if(activeRaysAvailable) {
-				numHostIntervention++;
-				PathIteration_times = PATH_ITER_INC_FACTOR;
-				/* Host intervention done before all rays become RAY_INACTIVE;
-				 * Set do more initial iterations for the next tile.
-				 */
-				numNextPathIterTimes += PATH_ITER_INC_FACTOR;
-			}
-
-			if(task->get_cancel()) {
-				canceled = true;
-				break;
-			}
-		}
-
-		/* Execute SumALLRadiance kernel to accumulate radiance calculated in
-		 * per_sample_output_buffers into RenderTile's output buffer.
-		 */
-		if(!canceled) {
-			size_t sum_all_radiance_local_size[2] = {16, 16};
-			size_t sum_all_radiance_global_size[2];
-			sum_all_radiance_global_size[0] =
-				(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
-				sum_all_radiance_local_size[0];
-			sum_all_radiance_global_size[1] =
-				(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
-				sum_all_radiance_local_size[1];
-			ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
-			                     sum_all_radiance_global_size,
-			                     sum_all_radiance_local_size);
-		}
-
-#undef ENQUEUE_SPLIT_KERNEL
-#undef GLUE
-
-		if(numHostIntervention == 0) {
-			/* This means that we are executing kernel more than required
-			 * Must avoid this for the next sample/tile.
-			 */
-			PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
-			PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
-		}
-		else {
-			/* Number of path-iterations done for this tile is set as
-			 * Initial path-iteration times for the next tile
-			 */
-			PathIteration_times = numNextPathIterTimes;
-		}
-
-		first_tile = false;
-	}
-
-	/* Calculates the amount of memory that has to be always
-	 * allocated in order for the split kernel to function.
-	 * This memory is tile/scene-property invariant (meaning,
-	 * the value returned by this function does not depend
-	 * on the user set tile size or scene properties.
-	 */
-	size_t get_invariable_mem_allocated()
-	{
-		size_t total_invariable_mem_allocated = 0;
-		size_t KernelGlobals_size = 0;
-
-		KernelGlobals_size = get_KernelGlobals_size();
-
-		total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
-		total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
-		total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
-
-		return total_invariable_mem_allocated;
-	}
-
-	/* Calculate the memory that has-to-be/has-been allocated for
-	 * the split kernel to function.
-	 */
-	size_t get_tile_specific_mem_allocated(const int2 tile_size)
-	{
-		size_t tile_specific_mem_allocated = 0;
-
-		/* Get required tile info */
-		unsigned int user_set_tile_w = tile_size.x;
-		unsigned int user_set_tile_h = tile_size.y;
-
-#ifdef __WORK_STEALING__
-		/* Calculate memory to be allocated for work_pools in
-		 * case of work_stealing.
-		 */
-		size_t max_global_size[2];
-		size_t max_num_work_pools = 0;
-		max_global_size[0] =
-			(((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		max_global_size[1] =
-			(((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		max_num_work_pools =
-			(max_global_size[0] * max_global_size[1]) /
-			(SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
-		tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
-#endif
-
-		tile_specific_mem_allocated +=
-			user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
-		tile_specific_mem_allocated +=
-			user_set_tile_w * user_set_tile_h * sizeof(RNG);
-
-		return tile_specific_mem_allocated;
-	}
-
-	/* Calculates the texture memories and KernelData (d_data) memory
-	 * that has been allocated.
-	 */
-	size_t get_scene_specific_mem_allocated(cl_mem d_data)
-	{
-		size_t scene_specific_mem_allocated = 0;
-		/* Calculate texture memories. */
-#define KERNEL_TEX(type, ttype, name) \
-	scene_specific_mem_allocated += get_tex_size(#name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-		size_t d_data_size;
-		ciErr = clGetMemObjectInfo(d_data,
-		                           CL_MEM_SIZE,
-		                           sizeof(d_data_size),
-		                           &d_data_size,
-		                           NULL);
-		assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
-		scene_specific_mem_allocated += d_data_size;
-		return scene_specific_mem_allocated;
-	}
-
-	/* Calculate the memory required for one thread in split kernel. */
-	size_t get_per_thread_memory()
-	{
-		size_t shaderdata_size = 0;
-		/* TODO(sergey): This will actually over-allocate if
-		 * particular kernel does not support multiclosure.
-		 */
-		shaderdata_size = get_shader_data_size(current_max_closure);
-		size_t retval = sizeof(RNG)
-			+ sizeof(float3)          /* Throughput size */
-			+ sizeof(float)           /* L transparent size */
-			+ sizeof(char)            /* Ray state size */
-			+ sizeof(unsigned int)    /* Work element size */
-			+ sizeof(int)             /* ISLamp_size */
-			+ sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
-			+ sizeof(Intersection)    /* Overall isect */
-			+ sizeof(Intersection)    /* Instersection_coop_AO */
-			+ sizeof(Intersection)    /* Intersection coop DL */
-			+ shaderdata_size         /* Overall ShaderData */
-			+ (shaderdata_size * 2)   /* ShaderData : DL and shadow */
-			+ sizeof(Ray) + sizeof(BsdfEval)
-			+ sizeof(float3)          /* AOAlpha size */
-			+ sizeof(float3)          /* AOBSDF size */
-			+ sizeof(Ray)
-			+ (sizeof(int) * NUM_QUEUES)
-			+ per_thread_output_buffer_size;
-		return retval;
-	}
-
-	/* Considers the total memory available in the device and
-	 * and returns the maximum global work size possible.
-	 */
-	size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
-	{
-		/* Calculate invariably allocated memory. */
-		size_t invariable_mem_allocated = get_invariable_mem_allocated();
-		/* Calculate tile specific allocated memory. */
-		size_t tile_specific_mem_allocated =
-			get_tile_specific_mem_allocated(tile_size);
-		/* Calculate scene specific allocated memory. */
-		size_t scene_specific_mem_allocated =
-			get_scene_specific_mem_allocated(d_data);
-		/* Calculate total memory available for the threads in global work size. */
-		size_t available_memory = total_allocatable_memory
-			- invariable_mem_allocated
-			- tile_specific_mem_allocated
-			- scene_specific_mem_allocated
-			- DATA_ALLOCATION_MEM_FACTOR;
-		size_t per_thread_memory_required = get_per_thread_memory();
-		return (available_memory / per_thread_memory_required);
-	}
-
-	/* Checks if the device has enough memory to render the whole tile;
-	 * If not, we should split single tile into multiple tiles of small size
-	 * and process them all.
-	 */
-	bool need_to_split_tile(unsigned int d_w,
-	                        unsigned int d_h,
-	                        int2 max_render_feasible_tile_size)
-	{
-		size_t global_size_estimate[2];
-		/* TODO(sergey): Such round-ups are in quite few places, need to replace
-		 * them with an utility macro.
-		 */
-		global_size_estimate[0] =
-			(((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		global_size_estimate[1] =
-			(((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		if((global_size_estimate[0] * global_size_estimate[1]) >
-		   (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
-		{
-			return true;
-		}
-		else {
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                               device->program_data_init(),
+		                               2,
+		                               NULL,
+		                               dim.global_size,
+		                               dim.local_size,
+		                               0,
+		                               NULL,
+		                               NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		if(device->ciErr != CL_SUCCESS) {
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
 			return false;
 		}
-	}
 
-	/* Considers the scene properties, global memory available in the device
-	 * and returns a rectanglular tile dimension (approx the maximum)
-	 * that should render on split kernel.
-	 */
-	int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
-	{
-		int2 max_render_feasible_tile_size;
-		int square_root_val = (int)sqrt(feasible_global_work_size);
-		max_render_feasible_tile_size.x = square_root_val;
-		max_render_feasible_tile_size.y = square_root_val;
-		/* Ciel round-off max_render_feasible_tile_size. */
-		int2 ceil_render_feasible_tile_size;
-		ceil_render_feasible_tile_size.x =
-			(((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		ceil_render_feasible_tile_size.y =
-			(((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <=
-		   feasible_global_work_size)
-		{
-			return ceil_render_feasible_tile_size;
-		}
-		/* Floor round-off max_render_feasible_tile_size. */
-		int2 floor_render_feasible_tile_size;
-		floor_render_feasible_tile_size.x =
-			(max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		floor_render_feasible_tile_size.y =
-			(max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		return floor_render_feasible_tile_size;
+		return true;
 	}
 
-	/* Try splitting the current tile into multiple smaller
-	 * almost-square-tiles.
-	 */
-	int2 get_split_tile_size(RenderTile rtile,
-	                         int2 max_render_feasible_tile_size)
+	virtual int2 split_kernel_local_size()  /* Returns the fixed pair (64, 1); presumably the work-group size for split kernel launches -- confirm against the caller. */
 	{
-		int2 split_tile_size;
-		int num_global_threads = max_render_feasible_tile_size.x *
-		                         max_render_feasible_tile_size.y;
-		int d_w = rtile.w;
-		int d_h = rtile.h;
-		/* Ceil round off d_w and d_h */
-		d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		while(d_w * d_h > num_global_threads) {
-			/* Halve the longer dimension. */
-			if(d_w >= d_h) {
-				d_w = d_w / 2;
-				d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-					SPLIT_KERNEL_LOCAL_SIZE_X;
-			}
-			else {
-				d_h = d_h / 2;
-				d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-					SPLIT_KERNEL_LOCAL_SIZE_Y;
-			}
-		}
-		split_tile_size.x = d_w;
-		split_tile_size.y = d_h;
-		return split_tile_size;
+		return make_int2(64, 1);
 	}
 
-	/* Splits existing tile into multiple tiles of tile size split_tile_size. */
-	vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
 	{
-		vector<SplitRenderTile> to_path_trace_rtile;
-		int d_w = rtile.w;
-		int d_h = rtile.h;
-		int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
-		int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
-		/* Buffer and rng_state offset calc. */
-		size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
-		size_t offset_x = offset_index % rtile.stride;
-		size_t offset_y = offset_index / rtile.stride;
-		/* Resize to_path_trace_rtile. */
-		to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
-		for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
-			for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
-				int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
-				to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
-				to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
-				to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
-				to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
-				to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
-				to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
-				to_path_trace_rtile[rtile_index].sample = rtile.sample;
-				to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
-				to_path_trace_rtile[rtile_index].offset = rtile.offset;
-				to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
-				to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
-				to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
-				to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
-				to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
-				to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
-				/* Fill width and height of the new render tile. */
-				to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
-					(d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
-					: split_tile_size.x;
-				to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
-					(d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
-					: split_tile_size.y;
-				to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
-			}
+		cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
+		/* Use small global size on CPU devices as it seems to be much faster. */
+		if(type == CL_DEVICE_TYPE_CPU) {
+			VLOG(1) << "Global size: (64, 64).";
+			return make_int2(64, 64);
 		}
-		return to_path_trace_rtile;
-	}
 
-	void thread_run(DeviceTask *task)
-	{
-		if(task->type == DeviceTask::FILM_CONVERT) {
-			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
-		}
-		else if(task->type == DeviceTask::SHADER) {
-			shader(*task);
-		}
-		else if(task->type == DeviceTask::PATH_TRACE) {
-			RenderTile tile;
-			bool initialize_data_and_check_render_feasibility = false;
-			bool need_to_split_tiles_further = false;
-			int2 max_render_feasible_tile_size;
-			size_t feasible_global_work_size;
-			const int2 tile_size = task->requested_tile_size;
-			/* Keep rendering tiles until done. */
-			while(task->acquire_tile(this, tile)) {
-				if(!initialize_data_and_check_render_feasibility) {
-					/* Initialize data. */
-					/* Calculate per_thread_output_buffer_size. */
-					size_t output_buffer_size = 0;
-					ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
-					                           CL_MEM_SIZE,
-					                           sizeof(output_buffer_size),
-					                           &output_buffer_size,
-					                           NULL);
-					assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
-					/* This value is different when running on AMD and NV. */
-					if(background) {
-						/* In offline render the number of buffer elements
-						 * associated with tile.buffer is the current tile size.
-						 */
-						per_thread_output_buffer_size =
-							output_buffer_size / (tile.w * tile.h);
-					}
-					else {
-						/* interactive rendering, unlike offline render, the number of buffer elements
-						 * associated with tile.buffer is the entire viewport size.
-						 */
-						per_thread_output_buffer_size =
-							output_buffer_size / (tile.buffers->params.width *
-							                      tile.buffers->params.height);
-					}
-					/* Check render feasibility. */
-					feasible_global_work_size = get_feasible_global_work_size(
-						tile_size,
-						CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
-					max_render_feasible_tile_size =
-						get_max_render_feasible_tile_size(
-							feasible_global_work_size);
-					need_to_split_tiles_further =
-						need_to_split_tile(tile_size.x,
-						                   tile_size.y,
-						                   max_render_feasible_tile_size);
-					initialize_data_and_check_render_feasibility = true;
-				}
-				if(need_to_split_tiles_further) {
-					int2 split_tile_size =
-						get_split_tile_size(tile,
-						                    max_render_feasible_tile_size);
-					vector<SplitRenderTile> to_path_trace_render_tiles =
-						split_tiles(tile, split_tile_size);
-					/* Print message to console */
-					if(background && (to_path_trace_render_tiles.size() > 1)) {
-						fprintf(stderr, "Message : Tiles need to be split "
-						        "further inside path trace (due to insufficient "
-						        "device-global-memory for split kernel to "
-						        "function) \n"
-						        "The current tile of dimensions %dx%d is split "
-						        "into tiles of dimension %dx%d for render \n",
-						        tile.w, tile.h,
-						        split_tile_size.x,
-						        split_tile_size.y);
-					}
-					/* Process all split tiles. */
-					for(int tile_iter = 0;
-					    tile_iter < to_path_trace_render_tiles.size();
-					    ++tile_iter)
-					{
-						path_trace(task,
-						           to_path_trace_render_tiles[tile_iter],
-						           max_render_feasible_tile_size);
-					}
-				}
-				else {
-					/* No splitting required; process the entire tile at once. */
-					/* Render feasible tile size is user-set-tile-size itself. */
-					max_render_feasible_tile_size.x =
-						(((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-						SPLIT_KERNEL_LOCAL_SIZE_X;
-					max_render_feasible_tile_size.y =
-						(((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-						SPLIT_KERNEL_LOCAL_SIZE_Y;
-					/* buffer_rng_state_stride is stride itself. */
-					SplitRenderTile split_tile(tile);
-					split_tile.buffer_rng_state_stride = tile.stride;
-					path_trace(task, split_tile, max_render_feasible_tile_size);
-				}
-				tile.sample = tile.start_sample + tile.num_samples;
+		cl_ulong max_buffer_size = 0;  /* Defined value even if the device query below fails. */
+		device->opencl_assert_err(clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL),
+		                          "clGetDeviceInfo");  /* Route the result through the same error hook used for clEnqueueNDRangeKernel instead of ignoring it. */
+		VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) << " bytes. ("
+		        << string_human_readable_size(max_buffer_size) << ").";
 
-				/* Complete kernel execution before release tile. */
-				/* This helps in multi-device render;
-				 * The device that reaches the critical-section function
-				 * release_tile waits (stalling other devices from entering
-				 * release_tile) for all kernels to complete. If device1 (a
-				 * slow-render device) reaches release_tile first then it would
-				 * stall device2 (a fast-render device) from proceeding to render
-				 * next tile.
-				 */
-				clFinish(cqCommandQueue);
-
-				task->release_tile(tile);
-			}
-		}
-	}
-
-protected:
-	cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
-	{
-		cl_mem ptr;
-		assert(bufsize != 0);
-		ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
-		opencl_assert_err(ciErr, "clCreateBuffer");
-		return ptr;
+		size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);  /* Element budget computed from half of the largest single allocation. */
+		int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), max((int)sqrt(num_elements), 1));  /* Clamp both sides: a zero-sized NDRange dimension cannot be enqueued. */
+		VLOG(1) << "Global size: " << global_size << ".";
+		return global_size;
 	}
+};
 
-	/* ** Those guys are for workign around some compiler-specific bugs ** */
+OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
+: OpenCLDeviceBase(info, stats, background_)
+{
+	split_kernel = new OpenCLSplitKernel(this);  /* Heap-allocated; receives this device as its constructor argument. NOTE(review): matching delete is not visible here -- confirm the destructor frees it. */
 
-	string build_options_for_base_program(
-	        const DeviceRequestedFeatures& requested_features)
-	{
-		return requested_features.get_build_options();
-	}
-};
+	background = background_;  /* NOTE(review): background_ is also forwarded to OpenCLDeviceBase above; confirm this assignment is still required. */
+}
 
 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
 {
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 82e1640e508..8128fcee09b 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -16,11 +16,12 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 using std::cerr;
 using std::endl;
@@ -234,7 +235,7 @@ string OpenCLCache::get_kernel_md5()
 	thread_scoped_lock lock(self.kernel_md5_lock);
 
 	if(self.kernel_md5.empty()) {
-		self.kernel_md5 = path_files_md5_hash(path_get("kernel"));
+		self.kernel_md5 = path_files_md5_hash(path_get("source"));
 	}
 	return self.kernel_md5;
 }
@@ -309,6 +310,8 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
 	string build_options;
 	build_options = device->kernel_build_options(debug_src) + kernel_build_options;
 
+	VLOG(1) << "Build options passed to clBuildProgram: '"
+	        << build_options << "'.";
 	cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
 
 	/* show warnings even if build is successful */
@@ -336,12 +339,13 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
 
 bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 {
-	string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n";
+	string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
 	/* We compile kernels consisting of many files. unfortunately OpenCL
 	 * kernel caches do not seem to recognize changes in included files.
 	 * so we force recompile on changes by adding the md5 hash of all files.
 	 */
-	source = path_source_replace_includes(source, path_get("kernel"));
+	source = path_source_replace_includes(source, path_get("source"));
+	source += "\n// " + util_md5_string(source) + "\n";
 
 	if(debug_src) {
 		path_write_text(*debug_src, source);
@@ -352,10 +356,10 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 	cl_int ciErr;
 
 	program = clCreateProgramWithSource(device->cxContext,
-	                                   1,
-	                                   &source_str,
-	                                   &source_len,
-	                                   &ciErr);
+	                                    1,
+	                                    &source_str,
+	                                    &source_len,
+	                                    &ciErr);
 
 	if(ciErr != CL_SUCCESS) {
 		add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
@@ -438,7 +442,11 @@ void OpenCLDeviceBase::OpenCLProgram::load()
 	if(!program) {
 		add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
 
-		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5();
+		/* need to create source to get md5 */
+		string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
+		source = path_source_replace_includes(source, path_get("source"));
+
+		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
 		basename = path_cache_get(path_join("kernels", basename));
 		string clbin = basename + ".clbin";
 
@@ -544,6 +552,11 @@ bool OpenCLInfo::use_debug()
 	return DebugFlags().opencl.debug;
 }
 
+bool OpenCLInfo::use_single_program()
+{
+	return DebugFlags().opencl.single_program;  /* Mirrors the debug flag. */
+}
+
 bool OpenCLInfo::kernel_use_advanced_shading(const string& platform)
 {
 	/* keep this in sync with kernel_types.h! */
@@ -587,11 +600,20 @@ bool OpenCLInfo::device_supported(const string& platform_name,
                                   const cl_device_id device_id)
 {
 	cl_device_type device_type;
-	clGetDeviceInfo(device_id,
-	                CL_DEVICE_TYPE,
-	                sizeof(cl_device_type),
-	                &device_type,
-	                NULL);
+	if(!get_device_type(device_id, &device_type)) {
+		return false;
+	}
+	string device_name;
+	if(!get_device_name(device_id, &device_name)) {
+		return false;
+	}
+	/* It is possible to have Iris GPU on AMD/Apple OpenCL framework
+	 * (aka, it will not be on Intel framework). This isn't supported
+	 * and needs an explicit blacklist.
+	 */
+	if(strstr(device_name.c_str(), "Iris")) {
+		return false;
+	}
 	if(platform_name == "AMD Accelerated Parallel Processing" &&
 	   device_type == CL_DEVICE_TYPE_GPU)
 	{
@@ -705,39 +727,30 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 		return;
 	}
 
+	cl_int error;
 	vector<cl_device_id> device_ids;
-	cl_uint num_devices = 0;
 	vector<cl_platform_id> platform_ids;
-	cl_uint num_platforms = 0;
 
-	/* Get devices. */
-	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
-	   num_platforms == 0)
-	{
-		FIRST_VLOG(2) << "No OpenCL platforms were found.";
+	/* Get platforms. */
+	if(!get_platforms(&platform_ids, &error)) {
+		FIRST_VLOG(2) << "Error fetching platforms:"
+		              << string(clewErrorString(error));
 		first_time = false;
 		return;
 	}
-	platform_ids.resize(num_platforms);
-	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
-		FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
+	if(platform_ids.size() == 0) {
+		FIRST_VLOG(2) << "No OpenCL platforms were found.";
 		first_time = false;
 		return;
 	}
 	/* Devices are numbered consecutively across platforms. */
-	for(int platform = 0; platform < num_platforms; platform++) {
+	for(int platform = 0; platform < platform_ids.size(); platform++) {
 		cl_platform_id platform_id = platform_ids[platform];
-		char pname[256];
-		if(clGetPlatformInfo(platform_id,
-		                     CL_PLATFORM_NAME,
-		                     sizeof(pname),
-		                     &pname,
-		                     NULL) != CL_SUCCESS)
-		{
+		string platform_name;
+		if(!get_platform_name(platform_id, &platform_name)) {
 			FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
 			continue;
 		}
-		string platform_name = pname;
 		FIRST_VLOG(2) << "Enumerating devices for platform "
 		              << platform_name << ".";
 		if(!platform_version_check(platform_id)) {
@@ -745,39 +758,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 			              << " due to too old compiler version.";
 			continue;
 		}
-		num_devices = 0;
-		cl_int ciErr;
-		if((ciErr = clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  0,
-		                  NULL,
-		                  &num_devices)) != CL_SUCCESS || num_devices == 0)
+		if(!get_platform_devices(platform_id,
+		                         device_type,
+		                         &device_ids,
+		                         &error))
 		{
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch number of devices: " << string(clewErrorString(ciErr));
+			              << ", failed to fetch of devices: "
+			              << string(clewErrorString(error));
 			continue;
 		}
-		device_ids.resize(num_devices);
-		if(clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  num_devices,
-		                  &device_ids[0],
-		                  NULL) != CL_SUCCESS)
-		{
+		if(device_ids.size() == 0) {
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch devices list.";
+			              << ", it has no devices.";
 			continue;
 		}
-		for(int num = 0; num < num_devices; num++) {
-			cl_device_id device_id = device_ids[num];
-			char device_name[1024] = "\0";
-			if(clGetDeviceInfo(device_id,
-			                   CL_DEVICE_NAME,
-			                   sizeof(device_name),
-			                   &device_name,
-			                   NULL) != CL_SUCCESS)
-			{
-				FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
+		for(int num = 0; num < device_ids.size(); num++) {
+			const cl_device_id device_id = device_ids[num];
+			string device_name;
+			if(!get_device_name(device_id, &device_name, &error)) {
+				FIRST_VLOG(2) << "Failed to fetch device name: "
+				              << string(clewErrorString(error))
+				              << ", ignoring.";
 				continue;
 			}
 			if(!device_version_check(device_id)) {
@@ -789,24 +791,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 			   device_supported(platform_name, device_id))
 			{
 				cl_device_type device_type;
-				if(clGetDeviceInfo(device_id,
-				                   CL_DEVICE_TYPE,
-				                   sizeof(cl_device_type),
-				                   &device_type,
-				                   NULL) != CL_SUCCESS)
-				{
+				if(!get_device_type(device_id, &device_type, &error)) {
 					FIRST_VLOG(2) << "Ignoring device " << device_name
-					              << ", failed to fetch device type.";
+					              << ", failed to fetch device type:"
+					              << string(clewErrorString(error));
 					continue;
 				}
-				FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+				string readable_device_name =
+				        get_readable_device_name(device_id);
+				if(readable_device_name != device_name) {
+					FIRST_VLOG(2) << "Using more readable device name: "
+					              << readable_device_name;
+				}
+				FIRST_VLOG(2) << "Adding new device "
+				              << readable_device_name << ".";
 				string hardware_id = get_hardware_id(platform_name, device_id);
-				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
-				                                               platform_name,
-				                                               device_id,
-				                                               device_type,
-				                                               device_name,
-				                                               hardware_id));
+				usable_devices->push_back(OpenCLPlatformDevice(
+				        platform_id,
+				        platform_name,
+				        device_id,
+				        device_type,
+				        readable_device_name,
+				        hardware_id));
 			}
 			else {
 				FIRST_VLOG(2) << "Ignoring device " << device_name
@@ -817,6 +823,252 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 	first_time = false;
 }
 
+bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids,
+                               cl_int *error)
+{
+	/* Reset from possible previous state; the list stays empty on failure. */
+	platform_ids->resize(0);
+	cl_uint num_platforms;
+	if(!get_num_platforms(&num_platforms, error)) {
+		return false;
+	}
+	/* Zero platforms is not an error, but at(0) on an empty vector throws
+	 * and clGetPlatformIDs() rejects a zero-sized request, so skip the call. */
+	cl_int err = CL_SUCCESS;
+	platform_ids->resize(num_platforms);
+	if(num_platforms != 0) {
+		err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL);
+	}
+	if(error != NULL) {
+		*error = err;
+	}
+	if(err != CL_SUCCESS) {
+		/* Do not hand back a list of unfilled platform ids. */
+		platform_ids->resize(0);
+	}
+	return err == CL_SUCCESS;
+}
+
+vector<cl_platform_id> OpenCLInfo::get_platforms()
+{
+	vector<cl_platform_id> found;  /* Returned as left by the query below. */
+	get_platforms(&found);  /* Success flag is not inspected. */
+	return found;
+}
+
+/* Query how many OpenCL platforms exist. On failure the count is zeroed.
+ * The status code is stored in *error when that pointer is non-NULL. */
+bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
+{
+	const cl_int status = clGetPlatformIDs(0, NULL, num_platforms);
+	const bool success = (status == CL_SUCCESS);
+	if(error != NULL) {
+		*error = status;
+	}
+	if(!success) {
+		/* Never leave a stale count behind. */
+		*num_platforms = 0;
+	}
+	return success;
+}
+
+/* Convenience overload: number of platforms, or 0 when the query fails. */
+cl_uint OpenCLInfo::get_num_platforms()
+{
+	cl_uint count;
+	const bool success = get_num_platforms(&count);
+	/* Explicit zero on failure rather than trusting the out-parameter. */
+	return success ? count : 0;
+}
+
+/* Fetch the CL_PLATFORM_NAME string of the given platform.
+ * On failure *platform_name is set to the empty string. */
+bool OpenCLInfo::get_platform_name(cl_platform_id platform_id,
+                                   string *platform_name)
+{
+	char name[256];
+	const cl_int status = clGetPlatformInfo(platform_id,
+	                                        CL_PLATFORM_NAME,
+	                                        sizeof(name),
+	                                        &name,
+	                                        NULL);
+	const bool success = (status == CL_SUCCESS);
+	/* The buffer content is only meaningful when the query succeeded. */
+	*platform_name = success ? name : "";
+	return success;
+}
+
+/* Convenience overload: the platform name, or "" when it cannot be read. */
+string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
+{
+	string name;
+	const bool success = get_platform_name(platform_id, &name);
+	/* Explicit empty string on failure, independent of the out-parameter. */
+	return success ? name : string("");
+}
+
+/* Count the devices of device_type on a platform. On failure the count is
+ * zeroed. The status code is stored in *error when that pointer is non-NULL. */
+bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
+                                          cl_device_type device_type,
+                                          cl_uint *num_devices,
+                                          cl_int *error)
+{
+	/* Size-only query: no output array, just the count. */
+	const cl_int status = clGetDeviceIDs(platform_id,
+	                                     device_type,
+	                                     0,
+	                                     NULL,
+	                                     num_devices);
+	const bool success = (status == CL_SUCCESS);
+	if(error != NULL) {
+		*error = status;
+	}
+	if(!success) {
+		/* Never leave a stale count behind. */
+		*num_devices = 0;
+	}
+	return success;
+}
+
+/* Convenience overload: number of matching devices on the platform,
+ * or 0 when the query fails. */
+cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
+                                             cl_device_type device_type)
+{
+	cl_uint count;
+	const bool success = get_num_platform_devices(platform_id,
+	                                              device_type,
+	                                              &count);
+	/* Explicit zero on failure rather than trusting the out-parameter. */
+	return success ? count : 0;
+}
+
+bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
+                                      cl_device_type device_type,
+                                      vector<cl_device_id> *device_ids,
+                                      cl_int* error)
+{
+	/* Reset from possible previous state; the list stays empty on failure. */
+	device_ids->resize(0);
+	/* Get number of devices to pre-allocate memory. */
+	cl_uint num_devices;
+	if(!get_num_platform_devices(platform_id,
+	                             device_type,
+	                             &num_devices,
+	                             error))
+	{
+		return false;
+	}
+	/* Get actual device list. Skip the query for a zero count: at(0) on an
+	 * empty vector throws and clGetDeviceIDs() rejects a zero-sized request. */
+	cl_int err = CL_SUCCESS;
+	device_ids->resize(num_devices);
+	if(num_devices != 0) {
+		err = clGetDeviceIDs(platform_id,
+		                     device_type,
+		                     num_devices,
+		                     &device_ids->at(0),
+		                     NULL);
+	}
+	if(error != NULL) {
+		*error = err;
+	}
+	if(err != CL_SUCCESS) {
+		device_ids->resize(0);
+	}
+	return err == CL_SUCCESS;
+}
+
+vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
+                                                      cl_device_type device_type)
+{
+	vector<cl_device_id> found;  /* Returned as left by the query below. */
+	get_platform_devices(platform_id, device_type, &found);  /* Flag unused. */
+	return found;
+}
+
+/* Fetch the CL_DEVICE_NAME string of a device. On failure *device_name is
+ * emptied. The status code is stored in *error when it is non-NULL. */
+bool OpenCLInfo::get_device_name(cl_device_id device_id,
+                                 string *device_name,
+                                 cl_int* error)
+{
+	char name[1024];
+	const cl_int status = clGetDeviceInfo(device_id,
+	                                      CL_DEVICE_NAME,
+	                                      sizeof(name),
+	                                      &name,
+	                                      NULL);
+	const bool success = (status == CL_SUCCESS);
+	if(error != NULL) {
+		*error = status;
+	}
+	if(success) {
+		*device_name = name;
+	}
+	else {
+		*device_name = "";
+	}
+	return success;
+}
+
+/* Convenience overload: the device name, or "" when it cannot be read. */
+string OpenCLInfo::get_device_name(cl_device_id device_id)
+{
+	string name;
+	const bool success = get_device_name(device_id, &name);
+	/* Explicit empty string on failure, independent of the out-parameter. */
+	return success ? name : string("");
+}
+
+/* Read CL_DEVICE_TYPE of a device. On failure *device_type is zeroed.
+ * The status code is stored in *error when that pointer is non-NULL. */
+bool OpenCLInfo::get_device_type(cl_device_id device_id,
+                                 cl_device_type *device_type,
+                                 cl_int* error)
+{
+	/* The query writes straight into the caller's variable. */
+	const cl_int status = clGetDeviceInfo(device_id,
+	                                      CL_DEVICE_TYPE,
+	                                      sizeof(cl_device_type),
+	                                      device_type,
+	                                      NULL);
+	const bool success = (status == CL_SUCCESS);
+	if(error != NULL) {
+		*error = status;
+	}
+	if(!success) {
+		/* Never leave a partially written type behind. */
+		*device_type = 0;
+	}
+	return success;
+}
+
+/* Convenience overload: the device type, or 0 when it cannot be read. */
+cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
+{
+	cl_device_type type;
+	const bool success = get_device_type(device_id, &type);
+	/* Explicit zero on failure rather than trusting the out-parameter. */
+	return success ? type : 0;
+}
+
+/* Prefer the AMD board name of a device, else its standard device name. */
+string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
+{
+	char amd_name[1024];
+	const cl_int status = clGetDeviceInfo(device_id,
+	                                      CL_DEVICE_BOARD_NAME_AMD,
+	                                      sizeof(amd_name),
+	                                      &amd_name,
+	                                      NULL);
+	if(status != CL_SUCCESS) {
+		return get_device_name(device_id);
+	}
+	return amd_name;
+}
+
 CCL_NAMESPACE_END
 
 #endif
diff --git a/intern/cycles/graph/CMakeLists.txt b/intern/cycles/graph/CMakeLists.txt
index 4ea18728f1c..e70a18137bd 100644
--- a/intern/cycles/graph/CMakeLists.txt
+++ b/intern/cycles/graph/CMakeLists.txt
@@ -1,7 +1,6 @@
 
 set(INC
-	.
-	../util
+	..
 )
 
 set(SRC
diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp
index 3c228a716d5..10d91a1e4ef 100644
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "node.h"
-#include "node_type.h"
+#include "graph/node.h"
+#include "graph/node_type.h"
 
-#include "util_foreach.h"
-#include "util_param.h"
-#include "util_transform.h"
+#include "util/util_foreach.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h
index 64410f4539b..53425f5faf1 100644
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include "node_type.h"
+#include "graph/node_type.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_enum.h b/intern/cycles/graph/node_enum.h
index 2bae531c036..4e40c294f4f 100644
--- a/intern/cycles/graph/node_enum.h
+++ b/intern/cycles/graph/node_enum.h
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include "util_map.h"
-#include "util_param.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp
index 5b98de778ad..a3a8fa5f382 100644
--- a/intern/cycles/graph/node_type.cpp
+++ b/intern/cycles/graph/node_type.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include "node_type.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "graph/node_type.h"
+#include "util/util_foreach.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_type.h b/intern/cycles/graph/node_type.h
index 1fb135f6d22..7d46e31ce24 100644
--- a/intern/cycles/graph/node_type.h
+++ b/intern/cycles/graph/node_type.h
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include "node_enum.h"
+#include "graph/node_enum.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_xml.cpp b/intern/cycles/graph/node_xml.cpp
index 590e09645ed..aad2740ffc0 100644
--- a/intern/cycles/graph/node_xml.cpp
+++ b/intern/cycles/graph/node_xml.cpp
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "node_xml.h"
+#include "graph/node_xml.h"
 
-#include "util_foreach.h"
-#include "util_string.h"
-#include "util_transform.h"
+#include "util/util_foreach.h"
+#include "util/util_string.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_xml.h b/intern/cycles/graph/node_xml.h
index 7494c5e6e55..63e80bf79f2 100644
--- a/intern/cycles/graph/node_xml.h
+++ b/intern/cycles/graph/node_xml.h
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_map.h"
-#include "util_string.h"
-#include "util_xml.h"
+#include "util/util_map.h"
+#include "util/util_string.h"
+#include "util/util_xml.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 5f3ceb0f864..c3772dfa2d8 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -1,10 +1,7 @@
 remove_extra_strict_flags()
 
 set(INC
-	.
-	../util
-	osl
-	svm
+	..
 )
 
 set(INC_SYS
@@ -13,19 +10,28 @@ set(INC_SYS
 
 set(SRC
 	kernels/cpu/kernel.cpp
+	kernels/cpu/kernel_split.cpp
 	kernels/opencl/kernel.cl
+	kernels/opencl/kernel_state_buffer_size.cl
+	kernels/opencl/kernel_split.cl
 	kernels/opencl/kernel_data_init.cl
+	kernels/opencl/kernel_path_init.cl
 	kernels/opencl/kernel_queue_enqueue.cl
 	kernels/opencl/kernel_scene_intersect.cl
 	kernels/opencl/kernel_lamp_emission.cl
-	kernels/opencl/kernel_background_buffer_update.cl
+	kernels/opencl/kernel_do_volume.cl
+	kernels/opencl/kernel_indirect_background.cl
 	kernels/opencl/kernel_shader_eval.cl
 	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernels/opencl/kernel_subsurface_scatter.cl
 	kernels/opencl/kernel_direct_lighting.cl
-	kernels/opencl/kernel_shadow_blocked.cl
+	kernels/opencl/kernel_shadow_blocked_ao.cl
+	kernels/opencl/kernel_shadow_blocked_dl.cl
 	kernels/opencl/kernel_next_iteration_setup.cl
-	kernels/opencl/kernel_sum_all_radiance.cl
+	kernels/opencl/kernel_indirect_subsurface.cl
+	kernels/opencl/kernel_buffer_update.cl
 	kernels/cuda/kernel.cu
+	kernels/cuda/kernel_split.cu
 )
 
 set(SRC_BVH_HEADERS
@@ -68,6 +74,7 @@ set(SRC_HEADERS
 	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
+	kernel_path_subsurface.h
 	kernel_path_volume.h
 	kernel_projection.h
 	kernel_queues.h
@@ -88,6 +95,10 @@ set(SRC_KERNELS_CPU_HEADERS
 	kernels/cpu/kernel_cpu_image.h
 )
 
+set(SRC_KERNELS_CUDA_HEADERS
+	kernels/cuda/kernel_config.h
+)
+
 set(SRC_CLOSURE_HEADERS
 	closure/alloc.h
 	closure/bsdf.h
@@ -184,6 +195,7 @@ set(SRC_UTIL_HEADERS
 	../util/util_hash.h
 	../util/util_math.h
 	../util/util_math_fast.h
+	../util/util_math_intersect.h
 	../util/util_static_assert.h
 	../util/util_transform.h
 	../util/util_texture.h
@@ -191,17 +203,25 @@ set(SRC_UTIL_HEADERS
 )
 
 set(SRC_SPLIT_HEADERS
-	split/kernel_background_buffer_update.h
+	split/kernel_buffer_update.h
 	split/kernel_data_init.h
 	split/kernel_direct_lighting.h
+	split/kernel_do_volume.h
 	split/kernel_holdout_emission_blurring_pathtermination_ao.h
+	split/kernel_indirect_background.h
+	split/kernel_indirect_subsurface.h
 	split/kernel_lamp_emission.h
 	split/kernel_next_iteration_setup.h
+	split/kernel_path_init.h
+	split/kernel_queue_enqueue.h
 	split/kernel_scene_intersect.h
 	split/kernel_shader_eval.h
-	split/kernel_shadow_blocked.h
+	split/kernel_shadow_blocked_ao.h
+	split/kernel_shadow_blocked_dl.h
 	split/kernel_split_common.h
-	split/kernel_sum_all_radiance.h
+	split/kernel_split_data.h
+	split/kernel_split_data_types.h
+	split/kernel_subsurface_scatter.h
 )
 
 # CUDA module
@@ -229,8 +249,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernels/cuda/kernel.cu
+	set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
 		${SRC_HEADERS}
+		${SRC_KERNELS_CUDA_HEADERS}
 		${SRC_BVH_HEADERS}
 		${SRC_SVM_HEADERS}
 		${SRC_GEOM_HEADERS}
@@ -239,15 +260,22 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	)
 	set(cuda_cubins)
 
-	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
-		if(${experimental})
-			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
-			set(cuda_cubin kernel_experimental_${arch}.cubin)
+	macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
+		if(${split})
+			set(cuda_extra_flags "-D__SPLIT__")
+			set(cuda_cubin kernel_split)
 		else()
 			set(cuda_extra_flags "")
-			set(cuda_cubin kernel_${arch}.cubin)
+			set(cuda_cubin kernel)
 		endif()
 
+		if(${experimental})
+			set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
+			set(cuda_cubin ${cuda_cubin}_experimental)
+		endif()
+
+		set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+
 		if(WITH_CYCLES_DEBUG)
 			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
 		else()
@@ -260,13 +288,19 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
 		set(cuda_math_flags "--use_fast_math")
 
+		if(${split})  # macro arguments are not variables; expand explicitly
+			set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
+		else()
+			set(cuda_kernel_src "/kernels/cuda/kernel.cu")
+		endif()
+
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
 			COMMAND ${cuda_nvcc_command}
 					-arch=${arch}
 					${CUDA_NVCC_FLAGS}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
@@ -274,8 +308,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 					${cuda_math_flags}
 					${cuda_extra_flags}
 					${cuda_debug_flags}
-					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
-					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
+					-I${CMAKE_CURRENT_SOURCE_DIR}/..
 					-DCCL_NAMESPACE_BEGIN=
 					-DCCL_NAMESPACE_END=
 					-DNVCC
@@ -293,7 +326,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		# Compile regular kernel
-		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+
+		if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
+			# Compile split kernel
+			CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+		endif()
 	endforeach()
 
 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@@ -311,36 +349,50 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
+set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+
 if(CXX_HAS_SSE)
 	list(APPEND SRC
 		kernels/cpu/kernel_sse2.cpp
 		kernels/cpu/kernel_sse3.cpp
 		kernels/cpu/kernel_sse41.cpp
+		kernels/cpu/kernel_split_sse2.cpp
+		kernels/cpu/kernel_split_sse3.cpp
+		kernels/cpu/kernel_split_sse41.cpp
 	)
 
 	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX)
 	list(APPEND SRC
 		kernels/cpu/kernel_avx.cpp
+		kernels/cpu/kernel_split_avx.cpp
 	)
 	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX2)
 	list(APPEND SRC
 		kernels/cpu/kernel_avx2.cpp
+		kernels/cpu/kernel_split_avx2.cpp
 	)
 	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
 add_library(cycles_kernel
 	${SRC}
 	${SRC_HEADERS}
 	${SRC_KERNELS_CPU_HEADERS}
+	${SRC_KERNELS_CUDA_HEADERS}
 	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
 	${SRC_SVM_HEADERS}
@@ -362,24 +414,33 @@ endif()
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
 
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split)
 
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 36798982653..85741016b25 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -27,43 +27,43 @@
 
 CCL_NAMESPACE_BEGIN
 
-#include "bvh_types.h"
+#include "kernel/bvh/bvh_types.h"
 
 /* Common QBVH functions. */
 #ifdef __QBVH__
-#  include "qbvh_nodes.h"
+#  include "kernel/bvh/qbvh_nodes.h"
 #endif
 
 /* Regular BVH traversal */
 
-#include "bvh_nodes.h"
+#include "kernel/bvh/bvh_nodes.h"
 
 #define BVH_FUNCTION_NAME bvh_intersect
 #define BVH_FUNCTION_FEATURES 0
-#include "bvh_traversal.h"
+#include "kernel/bvh/bvh_traversal.h"
 
 #if defined(__INSTANCING__)
 #  define BVH_FUNCTION_NAME bvh_intersect_instancing
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__HAIR__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 /* Subsurface scattering BVH traversal */
@@ -71,12 +71,12 @@ CCL_NAMESPACE_BEGIN
 #if defined(__SUBSURFACE__)
 #  define BVH_FUNCTION_NAME bvh_intersect_subsurface
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_subsurface.h"
+#  include "kernel/bvh/bvh_subsurface.h"
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
 #    define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
-#    include "bvh_subsurface.h"
+#    include "kernel/bvh/bvh_subsurface.h"
 #  endif
 #endif  /* __SUBSURFACE__ */
 
@@ -85,18 +85,18 @@ CCL_NAMESPACE_BEGIN
 #if defined(__VOLUME__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume.h"
+#  include "kernel/bvh/bvh_volume.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif
 #endif  /* __VOLUME__ */
 
@@ -105,30 +105,30 @@ CCL_NAMESPACE_BEGIN
 #if defined(__SHADOW_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #  define BVH_FUNCTION_FEATURES 0
-#  include "bvh_shadow_all.h"
+#  include "kernel/bvh/bvh_shadow_all.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__HAIR__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 #endif  /* __SHADOW_RECORD_ALL__ */
 
@@ -137,18 +137,18 @@ CCL_NAMESPACE_BEGIN
 #if defined(__VOLUME_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_all
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume_all.h"
+#  include "kernel/bvh/bvh_volume_all.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif
 #endif  /* __VOLUME_RECORD_ALL__ */
 
@@ -202,8 +202,9 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
 }
 
 #ifdef __SUBSURFACE__
+/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
 ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
-                                                     const Ray *ray,
+                                                     const Ray ray,
                                                      SubsurfaceIntersection *ss_isect,
                                                      int subsurface_object,
                                                      uint *lcg_state,
@@ -212,7 +213,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_subsurface_motion(kg,
-		                                       ray,
+		                                       &ray,
 		                                       ss_isect,
 		                                       subsurface_object,
 		                                       lcg_state,
@@ -220,7 +221,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 	}
 #endif /* __OBJECT_MOTION__ */
 	return bvh_intersect_subsurface(kg,
-	                                ray,
+	                                &ray,
 	                                ss_isect,
 	                                subsurface_object,
 	                                lcg_state,
@@ -229,30 +230,63 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 #endif
 
 #ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     int skip_object,
+                                                     uint max_hits,
+                                                     uint *num_hits)
 {
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 #    ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
+		if(kernel_data.bvh.have_curves) {
+			return bvh_intersect_shadow_all_hair_motion(kg,
+			                                            ray,
+			                                            isect,
+			                                            skip_object,
+			                                            max_hits,
+			                                            num_hits);
+		}
 #    endif /* __HAIR__ */
 
-		return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
+		return bvh_intersect_shadow_all_motion(kg,
+		                                       ray,
+		                                       isect,
+		                                       skip_object,
+		                                       max_hits,
+		                                       num_hits);
 	}
 #  endif /* __OBJECT_MOTION__ */
 
 #  ifdef __HAIR__
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_curves) {
+		return bvh_intersect_shadow_all_hair(kg,
+		                                     ray,
+		                                     isect,
+		                                     skip_object,
+		                                     max_hits,
+		                                     num_hits);
+	}
 #  endif /* __HAIR__ */
 
 #  ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_instancing) {
+		return bvh_intersect_shadow_all_instancing(kg,
+		                                           ray,
+		                                           isect,
+		                                           skip_object,
+		                                           max_hits,
+		                                           num_hits);
+	}
 #  endif /* __INSTANCING__ */
 
-	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_shadow_all(kg,
+	                                ray,
+	                                isect,
+	                                skip_object,
+	                                max_hits,
+	                                num_hits);
 }
 #endif  /* __SHADOW_RECORD_ALL__ */
 
@@ -357,7 +391,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
-#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
+#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
 /* ToDo: Move to another file? */
 ccl_device int intersections_compare(const void *a, const void *b)
 {
@@ -373,5 +407,33 @@ ccl_device int intersections_compare(const void *a, const void *b)
 }
 #endif
 
-CCL_NAMESPACE_END
+#if defined(__SHADOW_RECORD_ALL__)
+/* Sort the first num_hits entries of hits in place. The GPU path orders them
+ * by ascending t; the CPU path delegates ordering to intersections_compare().
+ * Arrays with zero or one entry are left untouched. */
+ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
+{
+#ifdef __KERNEL_GPU__
+	/* Use bubble sort which has more friendly memory pattern on GPU. */
+	bool swapped;
+	do {
+		swapped = false;
+		/* Written as j + 1 < num_hits because num_hits is unsigned:
+		 * num_hits - 1 would wrap around for an empty array. */
+		for(uint j = 0; j + 1 < num_hits; ++j) {
+			if(hits[j].t > hits[j + 1].t) {
+				struct Intersection tmp = hits[j];
+				hits[j] = hits[j + 1];
+				hits[j + 1] = tmp;
+				swapped = true;
+			}
+		}
+		--num_hits;
+	} while(swapped);
+#else
+	qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+#endif
+}
+#endif  /* __SHADOW_RECORD_ALL__ */
 
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 726bef1794c..74a9ebf14e4 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -17,8 +17,8 @@
 // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
 // 3-vector which might be faster.
 ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
-                                                           int node_addr,
-                                                           int child)
+                                                                int node_addr,
+                                                                int child)
 {
 	Transform space;
 	const int child_addr = node_addr + child * 3;
@@ -31,12 +31,12 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
 
 #if !defined(__KERNEL_SSE2__)
 ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
-                                                 const float3 P,
-                                                 const float3 idir,
-                                                 const float t,
-                                                 const int node_addr,
-                                                 const uint visibility,
-                                                 float dist[2])
+                                                      const float3 P,
+                                                      const float3 idir,
+                                                      const float t,
+                                                      const int node_addr,
+                                                      const uint visibility,
+                                                      float dist[2])
 {
 
 	/* fetch node data */
@@ -78,14 +78,14 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
-                                                        const float3 P,
-                                                        const float3 idir,
-                                                        const float t,
-                                                        const float difl,
-                                                        const float extmax,
-                                                        const int node_addr,
-                                                        const uint visibility,
-                                                        float dist[2])
+                                                             const float3 P,
+                                                             const float3 idir,
+                                                             const float t,
+                                                             const float difl,
+                                                             const float extmax,
+                                                             const int node_addr,
+                                                             const uint visibility,
+                                                             float dist[2])
 {
 
 	/* fetch node data */
@@ -203,13 +203,13 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const float3 idir,
-                                                   const float t,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const float3 idir,
+                                                        const float t,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -233,15 +233,15 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const float3 idir,
-                                                          const float t,
-                                                          const float difl,
-                                                          const float extmax,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const float3 idir,
+                                                               const float t,
+                                                               const float difl,
+                                                               const float extmax,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -265,13 +265,13 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3 P,
-                                         const float3 dir,
-                                         const float3 idir,
-                                         const float t,
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3 P,
+                                              const float3 dir,
+                                              const float3 idir,
+                                              const float t,
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -296,15 +296,15 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3 P,
-                                                const float3 dir,
-                                                const float3 idir,
-                                                const float t,
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3 P,
+                                                     const float3 dir,
+                                                     const float3 idir,
+                                                     const float t,
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -442,19 +442,19 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const ssef& isect_near,
-                                                   const ssef& isect_far,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const ssef& isect_near,
+                                                        const ssef& isect_far,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
 
 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -503,20 +503,20 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const ssef& isect_near,
-                                                          const ssef& isect_far,
-                                                          const float difl,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const ssef& isect_near,
+                                                               const ssef& isect_far,
+                                                               const float difl,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
 
 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -574,17 +574,17 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3& P,
-                                         const float3& dir,
-                                         const ssef& isect_near,
-                                         const ssef& isect_far,
-                                         const ssef& tsplat,
-                                         const ssef Psplat[3],
-                                         const ssef idirsplat[3],
-                                         const shuffle_swap_t shufflexyz[3],
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3& P,
+                                              const float3& dir,
+                                              const ssef& isect_near,
+                                              const ssef& isect_far,
+                                              const ssef& tsplat,
+                                              const ssef Psplat[3],
+                                              const ssef idirsplat[3],
+                                              const shuffle_swap_t shufflexyz[3],
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -612,19 +612,19 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3& P,
-                                                const float3& dir,
-                                                const ssef& isect_near,
-                                                const ssef& isect_far,
-                                                const ssef& tsplat,
-                                                const ssef Psplat[3],
-                                                const ssef idirsplat[3],
-                                                const shuffle_swap_t shufflexyz[3],
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3& P,
+                                                     const float3& dir,
+                                                     const ssef& isect_near,
+                                                     const ssef& isect_far,
+                                                     const ssef& tsplat,
+                                                     const ssef Psplat[3],
+                                                     const ssef idirsplat[3],
+                                                     const shuffle_swap_t shufflexyz[3],
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index df33a86bb18..267e098f912 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_shadow_all.h"
+#  include "kernel/bvh/qbvh_shadow_all.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -45,6 +45,7 @@ ccl_device_inline
 bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                  const Ray *ray,
                                  Intersection *isect_array,
+                                 const int skip_object,
                                  const uint max_hits,
                                  uint *num_hits)
 {
@@ -100,9 +101,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif  /* __KERNEL_SSE2__ */
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -189,6 +187,16 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					while(prim_addr < prim_addr2) {
 						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
 
+#ifdef __SHADOW_TRICKS__
+						uint tri_object = (object == OBJECT_NONE)
+						        ? kernel_tex_fetch(__prim_object, prim_addr)
+						        : object;
+						if(tri_object == skip_object) {
+							++prim_addr;
+							continue;
+						}
+#endif
+
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -198,9 +206,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
+								                         dir,
 								                         PATH_RAY_SHADOW,
 								                         object,
 								                         prim_addr);
@@ -309,12 +317,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
 					num_hits_in_instance = 0;
 					isect_array->t = isect_t;
 
@@ -354,22 +361,17 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
 
-				triangle_intersect_precalc(dir, &isect_precalc);
-
 				/* scale isect->t to adjust for instancing */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
-
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
 			isect_t = tmax;
@@ -400,6 +402,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const Ray *ray,
                                          Intersection *isect_array,
+                                         const int skip_object,
                                          const uint max_hits,
                                          uint *num_hits)
 {
@@ -408,6 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
 		                                    ray,
 		                                    isect_array,
+		                                    skip_object,
 		                                    max_hits,
 		                                    num_hits);
 	}
@@ -418,6 +422,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
 		                                   ray,
 		                                   isect_array,
+		                                   skip_object,
 		                                   max_hits,
 		                                   num_hits);
 	}
diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h
index 889bbca21e2..bda7e34907a 100644
--- a/intern/cycles/kernel/bvh/bvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/bvh_subsurface.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_subsurface.h"
+#  include "kernel/bvh/qbvh_subsurface.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -75,16 +75,16 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
+		isect_t = bvh_instance_motion_push(kg,
+		                                   subsurface_object,
+		                                   ray,
+		                                   &P,
+		                                   &dir,
+		                                   &idir,
+		                                   isect_t,
+		                                   &ob_itfm);
 #else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+		isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t);
 #endif
 		object = subsurface_object;
 	}
@@ -109,9 +109,6 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -197,9 +194,9 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
 							                              ss_isect,
 							                              P,
+							                              dir,
 							                              object,
 							                              prim_addr,
 							                              isect_t,
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 80c8f31473a..c58d3b0316c 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_traversal.h"
+#  include "kernel/bvh/qbvh_traversal.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -104,9 +104,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -238,9 +235,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
-								                      &isect_precalc,
 								                      isect,
 								                      P,
+								                      dir,
 								                      visibility,
 								                      object,
 								                      prim_addr))
@@ -354,11 +351,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+					isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+					isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
-					triangle_intersect_precalc(dir, &isect_precalc);
 
 #  if defined(__KERNEL_SSE2__)
 					Psplat[0] = ssef(P.x);
@@ -391,11 +387,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
-			triangle_intersect_precalc(dir, &isect_precalc);
 
 #  if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 57e5b8d736d..764aaee44a1 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_volume.h"
+#  include "kernel/bvh/qbvh_volume.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -97,9 +97,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -194,9 +191,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									continue;
 								}
 								triangle_intersect(kg,
-								                   &isect_precalc,
 								                   isect,
 								                   P,
+								                   dir,
 								                   visibility,
 								                   object,
 								                   prim_addr);
@@ -238,13 +235,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
-
 #  if defined(__KERNEL_SSE2__)
 						Psplat[0] = ssef(P.x);
 						Psplat[1] = ssef(P.y);
@@ -281,13 +276,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 #  if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
 			Psplat[1] = ssef(P.y);
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index 5a1accebaa0..04ec334e54d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_volume_all.h"
+#  include "kernel/bvh/qbvh_volume_all.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -101,9 +101,6 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif  /* __KERNEL_SSE2__ */
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -199,9 +196,9 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									continue;
 								}
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
+								                         dir,
 								                         visibility,
 								                         object,
 								                         prim_addr);
@@ -288,14 +285,12 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
 						num_hits_in_instance = 0;
 						isect_array->t = isect_t;
 
@@ -341,20 +336,17 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #  else
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 				/* Scale isect->t to adjust for instancing. */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
 			isect_t = tmax;
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index 607295f9ed5..ce474438f2c 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -33,6 +33,7 @@
 ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                              const Ray *ray,
                                              Intersection *isect_array,
+                                             const int skip_object,
                                              const uint max_hits,
                                              uint *num_hits)
 {
@@ -96,15 +97,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+				(void)inodes;
 
 				if(false
 #ifdef __VISIBILITY_FLAG__
@@ -270,6 +269,16 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					while(prim_addr < prim_addr2) {
 						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
 
+#ifdef __SHADOW_TRICKS__
+						uint tri_object = (object == OBJECT_NONE)
+						        ? kernel_tex_fetch(__prim_object, prim_addr)
+						        : object;
+						if(tri_object == skip_object) {
+							++prim_addr;
+							continue;
+						}
+#endif
+
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -279,9 +288,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
+								                         dir,
 								                         PATH_RAY_SHADOW,
 								                         object,
 								                         prim_addr);
@@ -390,9 +399,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
 					num_hits_in_instance = 0;
@@ -414,8 +423,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
-
 					++stack_ptr;
 					kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 					traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -445,11 +452,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
 			}
 
@@ -472,8 +478,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h
index 84dc4003133..be7658d11d7 100644
--- a/intern/cycles/kernel/bvh/qbvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h
@@ -64,16 +64,16 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
+		isect_t = bvh_instance_motion_push(kg,
+		                                   subsurface_object,
+		                                   ray,
+		                                   &P,
+		                                   &dir,
+		                                   &idir,
+		                                   isect_t,
+		                                   &ob_itfm);
 #else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+		isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t);
 #endif
 		object = subsurface_object;
 	}
@@ -105,9 +105,6 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -253,9 +250,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
 							                              ss_isect,
 							                              P,
+							                              dir,
 							                              object,
 							                              prim_addr,
 							                              isect_t,
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index 10ae7bee852..fca75a1d416 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -106,15 +106,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+				(void)inodes;
 
 				if(UNLIKELY(node_dist > isect->t)
 #if BVH_FEATURE(BVH_MOTION)
@@ -122,8 +120,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				   || UNLIKELY(ray->time > inodes.z)
 #endif
 #ifdef __VISIBILITY_FLAG__
-				   || (__float_as_uint(inodes.x) & visibility) == 0)
+				   || (__float_as_uint(inodes.x) & visibility) == 0
 #endif
+				 )
 				{
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
@@ -333,9 +332,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
-								                      &isect_precalc,
 								                      isect,
 								                      P,
+								                      dir,
 								                      visibility,
 								                      object,
 								                      prim_addr)) {
@@ -447,8 +446,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
-
 					++stack_ptr;
 					kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 					traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -468,9 +465,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 			qbvh_near_far_idx_calc(idir,
@@ -489,8 +486,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			node_dist = traversal_stack[stack_ptr].dist;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
index dc6627e2dbb..192ce009524 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -91,9 +91,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -266,7 +263,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									continue;
 								}
 								/* Intersect ray against primitive. */
-								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr);
+								triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
 							}
 							break;
 						}
@@ -295,9 +292,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 						qbvh_near_far_idx_calc(idir,
@@ -316,8 +313,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
-
 						++stack_ptr;
 						kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 						traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -341,9 +336,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 			qbvh_near_far_idx_calc(idir,
@@ -362,8 +357,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
index ff1fa92af6e..ac5f58a9a51 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -95,9 +95,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -271,7 +268,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									continue;
 								}
 								/* Intersect ray against primitive. */
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr);
+								hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
 								if(hit) {
 									/* Move on to next entry in intersections array. */
 									isect_array++;
@@ -346,9 +343,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
 						qbvh_near_far_idx_calc(idir,
@@ -367,7 +364,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
 						num_hits_in_instance = 0;
 						isect_array->t = isect_t;
 
@@ -406,11 +402,10 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
 			}
 
@@ -433,8 +428,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index b7abc1ec507..e799855a65e 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -20,17 +20,17 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty
 {
 	kernel_assert(size <= sizeof(ShaderClosure));
 
-	int num_closure = ccl_fetch(sd, num_closure);
-	int num_closure_extra = ccl_fetch(sd, num_closure_extra);
+	int num_closure = sd->num_closure;
+	int num_closure_extra = sd->num_closure_extra;
 	if(num_closure + num_closure_extra >= MAX_CLOSURE)
 		return NULL;
 
-	ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure];
+	ShaderClosure *sc = &sd->closure[num_closure];
 
 	sc->type = type;
 	sc->weight = weight;
 
-	ccl_fetch(sd, num_closure)++;
+	sd->num_closure++;
 
 	return sc;
 }
@@ -44,25 +44,25 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
 	 * This lets us keep the same fast array iteration over closures, as we
 	 * found linked list iteration and iteration with skipping to be slower. */
 	int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure));
-	int num_closure = ccl_fetch(sd, num_closure);
-	int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra;
+	int num_closure = sd->num_closure;
+	int num_closure_extra = sd->num_closure_extra + num_extra;
 
 	if(num_closure + num_closure_extra > MAX_CLOSURE) {
 		/* Remove previous closure. */
-		ccl_fetch(sd, num_closure)--;
-		ccl_fetch(sd, num_closure_extra)++;
+		sd->num_closure--;
+		sd->num_closure_extra++;
 		return NULL;
 	}
 
-	ccl_fetch(sd, num_closure_extra) = num_closure_extra;
-	return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra);
+	sd->num_closure_extra = num_closure_extra;
+	return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra);
 }
 
 ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)
 {
 	ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
 
-	if(!sc)
+	if(sc == NULL)
 		return NULL;
 
 	float sample_weight = fabsf(average(weight));
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 04f9e711c7e..0302fa9b43e 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,77 +14,77 @@
  * limitations under the License.
  */
 
-#include "../closure/bsdf_ashikhmin_velvet.h"
-#include "../closure/bsdf_diffuse.h"
-#include "../closure/bsdf_oren_nayar.h"
-#include "../closure/bsdf_phong_ramp.h"
-#include "../closure/bsdf_diffuse_ramp.h"
-#include "../closure/bsdf_microfacet.h"
-#include "../closure/bsdf_microfacet_multi.h"
-#include "../closure/bsdf_reflection.h"
-#include "../closure/bsdf_refraction.h"
-#include "../closure/bsdf_transparent.h"
-#include "../closure/bsdf_ashikhmin_shirley.h"
-#include "../closure/bsdf_toon.h"
-#include "../closure/bsdf_hair.h"
-#include "../closure/bsdf_principled_diffuse.h"
-#include "../closure/bsdf_principled_sheen.h"
+#include "kernel/closure/bsdf_ashikhmin_velvet.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_oren_nayar.h"
+#include "kernel/closure/bsdf_phong_ramp.h"
+#include "kernel/closure/bsdf_diffuse_ramp.h"
+#include "kernel/closure/bsdf_microfacet.h"
+#include "kernel/closure/bsdf_microfacet_multi.h"
+#include "kernel/closure/bsdf_reflection.h"
+#include "kernel/closure/bsdf_refraction.h"
+#include "kernel/closure/bsdf_transparent.h"
+#include "kernel/closure/bsdf_ashikhmin_shirley.h"
+#include "kernel/closure/bsdf_toon.h"
+#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
 #ifdef __SUBSURFACE__
-#  include "../closure/bssrdf.h"
+#  include "kernel/closure/bssrdf.h"
 #endif
 #ifdef __VOLUME__
-#  include "../closure/volume.h"
+#  include "kernel/closure/volume.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
 
 ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
-                                  ShaderData *sd,
-                                  const ShaderClosure *sc,
-                                  float randu,
-                                  float randv,
-                                  float3 *eval,
-                                  float3 *omega_in,
-                                  differential3 *domega_in,
-                                  float *pdf)
+                                       ShaderData *sd,
+                                       const ShaderClosure *sc,
+                                       float randu,
+                                       float randv,
+                                       float3 *eval,
+                                       float3 *omega_in,
+                                       differential3 *domega_in,
+                                       float *pdf)
 {
 	int label;
 
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
-			label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __SVM__
 		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __OSL__
 		case CLOSURE_BSDF_PHONG_RAMP_ID:
-			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFLECTION_ID:
-			label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFRACTION_ID:
-			label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_TRANSPARENT_ID:
-			label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
@@ -93,63 +93,63 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
-			label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+			label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
-			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-			label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-			label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
-			label = bsdf_principled_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
-			label = bsdf_principled_sheen_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		default:
@@ -173,35 +173,35 @@ float3 bsdf_eval(KernelGlobals *kg,
 {
 	float3 eval;
 
-	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
+	if(dot(sd->Ng, omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __OSL__
 			case CLOSURE_BSDF_PHONG_RAMP_ID:
-				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
@@ -209,51 +209,51 @@ float3 bsdf_eval(KernelGlobals *kg,
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
-				eval = bsdf_principled_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
-				eval = bsdf_principled_sheen_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			default:
@@ -265,23 +265,23 @@ float3 bsdf_eval(KernelGlobals *kg,
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
@@ -289,51 +289,51 @@ float3 bsdf_eval(KernelGlobals *kg,
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
-				eval = bsdf_principled_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
-				eval = bsdf_principled_sheen_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			default:
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index b74e8ab97cf..58f6140970d 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -327,7 +327,7 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
 	       (bsdf_a->alpha_y == bsdf_b->alpha_y) &&
 	       (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
 	       (bsdf_a->ior == bsdf_b->ior) &&
-	       ((!bsdf_a->extra && !bsdf_b->extra) ||
+	       ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
 	        ((bsdf_a->extra && bsdf_b->extra) &&
 	         (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color))));
 }
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index a49b0717a3d..57f1e733ee7 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -43,7 +43,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 {
 	if(cosI > 0.9999f || cosI < 1e-6f) {
-		const float r = sqrtf(randU.x / (1.0f - randU.x));
+		const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
 		const float phi = M_2PI_F * randU.y;
 		return make_float2(r*cosf(phi), r*sinf(phi));
 	}
@@ -83,7 +83,7 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha
 	const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
 	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
 
-	const float2 cossin_phi = normalize(make_float2(wi_11.x, wi_11.y));
+	const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f));
 	const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y);
 	const float slope_y = alpha.y*(cossin_phi.y * slope_11.x + cossin_phi.x * slope_11.y);
 
@@ -313,18 +313,18 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons
 
 #define MF_PHASE_FUNCTION glass
 #define MF_MULTI_GLASS
-#include "bsdf_microfacet_multi_impl.h"
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 
 /* The diffuse phase function is not implemented as a node yet. */
 #if 0
 #define MF_PHASE_FUNCTION diffuse
 #define MF_MULTI_DIFFUSE
-#include "bsdf_microfacet_multi_impl.h"
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 #endif
 
 #define MF_PHASE_FUNCTION glossy
 #define MF_MULTI_GLOSSY
-#include "bsdf_microfacet_multi_impl.h"
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 
 ccl_device void bsdf_microfacet_multi_ggx_blur(ShaderClosure *sc, float roughness)
 {
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 6838e26c242..c623e3490fd 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,19 +14,19 @@
  * limitations under the License.
  */
 
-#include "geom_attribute.h"
-#include "geom_object.h"
+#include "kernel/geom/geom_attribute.h"
+#include "kernel/geom/geom_object.h"
 #ifdef __PATCH_EVAL__
-#  include "geom_patch.h"
+#  include "kernel/geom/geom_patch.h"
 #endif
-#include "geom_triangle.h"
-#include "geom_subd_triangle.h"
-#include "geom_triangle_intersect.h"
-#include "geom_motion_triangle.h"
-#include "geom_motion_triangle_intersect.h"
-#include "geom_motion_triangle_shader.h"
-#include "geom_motion_curve.h"
-#include "geom_curve.h"
-#include "geom_volume.h"
-#include "geom_primitive.h"
+#include "kernel/geom/geom_triangle.h"
+#include "kernel/geom/geom_subd_triangle.h"
+#include "kernel/geom/geom_triangle_intersect.h"
+#include "kernel/geom/geom_motion_triangle.h"
+#include "kernel/geom/geom_motion_triangle_intersect.h"
+#include "kernel/geom/geom_motion_triangle_shader.h"
+#include "kernel/geom/geom_motion_curve.h"
+#include "kernel/geom/geom_curve.h"
+#include "kernel/geom/geom_volume.h"
+#include "kernel/geom/geom_primitive.h"
 
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 08ccee56335..cc62192ef21 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *
 ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return ATTR_PRIM_CURVE;
 	}
 	else
@@ -53,12 +53,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
 
 ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id)
 {
-	if(ccl_fetch(sd, object) == PRIM_NONE) {
+	if(sd->object == PRIM_NONE) {
 		return attribute_not_found();
 	}
 
 	/* for SVM, find attribute by unique id */
-	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
 	attr_offset += attribute_primitive_type(kg, sd);
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
@@ -73,7 +73,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh
 	AttributeDescriptor desc;
 	desc.element = (AttributeElement)attr_map.y;
 	
-	if(ccl_fetch(sd, prim) == PRIM_NONE &&
+	if(sd->prim == PRIM_NONE &&
 	   desc.element != ATTR_ELEMENT_MESH &&
 	   desc.element != ATTR_ELEMENT_VOXEL &&
 	   desc.element != ATTR_ELEMENT_OBJECT)
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 9de335403ce..bb33b91847e 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -22,6 +22,12 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __HAIR__
 
+#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
+#  define ccl_device_curveintersect ccl_device
+#else
+#  define ccl_device_curveintersect ccl_device_forceinline
+#endif
+
 /* Reading attributes on various curve elements */
 
 ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
@@ -32,22 +38,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 		if(dy) *dy = 0.0f;
 #endif
 
-		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0);
 		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+		if(dx) *dx = sd->du.dx*(f1 - f0);
 		if(dy) *dy = 0.0f;
 #endif
 
-		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+		return (1.0f - sd->u)*f0 + sd->u*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -71,22 +77,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+		if(dx) *dx = sd->du.dx*(f1 - f0);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+		return (1.0f - sd->u)*f0 + sd->u*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -104,22 +110,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 {
 	float r = 0.0f;
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float4 P_curve[2];
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+		if(sd->type & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
 		}
 
-		r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w;
+		r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
 	}
 
 	return r*2.0f;
@@ -130,8 +136,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 
 ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
 {
-	float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 	int k1 = k0 + 1;
 
 	float4 P_curve[2];
@@ -139,7 +145,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 	P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 	P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 
-	return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u));
+	return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
 }
 
 /* Curve tangent normal */
@@ -148,14 +154,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
 {	
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 
-		tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu))));
+		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
 		tgN = normalize(tgN);
 
 		/* need to find suitable scaled gd for corrected normal */
 #if 0
-		tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu));
+		tgN = normalize(tgN - gd * sd->dPdu);
 #endif
 	}
 
@@ -222,13 +228,22 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
 
 #ifdef __KERNEL_SSE2__
 /* Pass P and dir by reference to aligned vector */
-ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 #else
-ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
 #endif
 {
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {
+			return false;
+		}
+	}
+
 	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
 	float epsilon = 0.0f;
 	float r_st, r_en;
@@ -255,9 +270,9 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		int ka = max(k0 - 1, v00.x);
 		int kb = min(k1 + 1, v00.x + v00.y - 1);
 
-#ifdef __KERNEL_AVX2__
+#if defined(__KERNEL_AVX2__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
 		avxf P_curve_0_1, P_curve_2_3;
-		if(type & PRIMITIVE_CURVE) {
+		if(is_curve_primitive) {
 			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
 			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
 		}
@@ -268,7 +283,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 #else  /* __KERNEL_AVX2__ */
 		ssef P_curve[4];
 
-		if(type & PRIMITIVE_CURVE) {
+		if(is_curve_primitive) {
 			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
 			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
 			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
@@ -290,7 +305,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
 		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
 
-#ifdef __KERNEL_AVX2__
+#if defined(__KERNEL_AVX2__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
 		const avxf vPP = _mm256_broadcast_ps(&P.m128);
 		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
 		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
@@ -363,7 +378,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 
 		float4 P_curve[4];
 
-		if(type & PRIMITIVE_CURVE) {
+		if(is_curve_primitive) {
 			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
 			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
 			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
@@ -679,7 +694,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 	return hit;
 }
 
-ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_curveintersect bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 {
 	/* define few macros to minimize code duplication for SSE */
@@ -689,6 +704,15 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #  define dot3(x, y) dot(x, y)
 #endif
 
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {
+			return false;
+		}
+	}
+
 	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
 	/* curve Intersection check */
 	int flags = kernel_data.curve.curveflags;
@@ -703,7 +727,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #ifndef __KERNEL_SSE2__
 	float4 P_curve[2];
 
-	if(type & PRIMITIVE_CURVE) {
+	if(is_curve_primitive) {
 		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
 		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
 	}
@@ -738,7 +762,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #else
 	ssef P_curve[2];
 	
-	if(type & PRIMITIVE_CURVE) {
+	if(is_curve_primitive) {
 		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
 		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
 	}
@@ -948,7 +972,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -961,7 +985,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 	int prim = kernel_tex_fetch(__prim_index, isect->prim);
 	float4 v00 = kernel_tex_fetch(__curves, prim);
 
-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 	int k1 = k0 + 1;
 
 	float3 tg;
@@ -972,14 +996,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 		float4 P_curve[4];
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+		if(sd->type & PRIMITIVE_CURVE) {
 			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
 			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
 			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
 			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
 		}
 		else {
-			motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve);
+			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
 		}
 
 		float3 p[4];
@@ -991,43 +1015,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		P = P + D*t;
 
 #ifdef __UV__
-		ccl_fetch(sd, u) = isect->u;
-		ccl_fetch(sd, v) = 0.0f;
+		sd->u = isect->u;
+		sd->v = 0.0f;
 #endif
 
 		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
 
 		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D))));
+			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
 			/* direction from inside to surface of curve */
 			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			ccl_fetch(sd, Ng) = normalize(P - p_curr);
+			sd->Ng = normalize(P - p_curr);
 
 			/* adjustment for changing radius */
 			float gd = isect->v;
 
 			if(gd != 0.0f) {
-				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
-				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
 			}
 		}
 
 		/* todo: sometimes the normal is still so that this is detected as
 		 * backfacing even if cull backfaces is enabled */
 
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
+		sd->N = sd->Ng;
 	}
 	else {
 		float4 P_curve[2];
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+		if(sd->type & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
 		}
 
 		float l = 1.0f;
@@ -1038,39 +1062,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		float3 dif = P - float4_to_float3(P_curve[0]);
 
 #ifdef __UV__
-		ccl_fetch(sd, u) = dot(dif,tg)/l;
-		ccl_fetch(sd, v) = 0.0f;
+		sd->u = dot(dif,tg)/l;
+		sd->v = 0.0f;
 #endif
 
 		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D));
-			ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
+			sd->Ng = -(D - tg * dot(tg, D));
+			sd->Ng = normalize(sd->Ng);
 		}
 		else {
 			float gd = isect->v;
 
 			/* direction from inside to surface of curve */
-			ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd);
+			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
 
 			/* adjustment for changing radius */
 			if(gd != 0.0f) {
-				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
-				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
 			}
 		}
 
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
+		sd->N = sd->Ng;
 	}
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	ccl_fetch(sd, dPdu) = tg;
-	ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng));
+	sd->dPdu = tg;
+	sd->dPdv = cross(tg, sd->Ng);
 #endif
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index d57d74ea882..f74995becf5 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -48,7 +48,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
 			return P;
 		}
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #  else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -77,7 +77,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #  else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -116,7 +116,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
 #  ifdef __INTERSECTION_REFINE__
 	if(isect->object != OBJECT_NONE) {
 #    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #    else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -144,7 +144,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #    else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -166,14 +166,15 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
  * time and do a ray intersection with the resulting triangle.
  */
 
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
-                                                 Intersection *isect,
-                                                 float3 P,
-                                                 float3 dir,
-                                                 float time,
-                                                 uint visibility,
-                                                 int object,
-                                                 int prim_addr)
+ccl_device_inline bool motion_triangle_intersect(
+        KernelGlobals *kg,
+        Intersection *isect,
+        float3 P,
+        float3 dir,
+        float time,
+        uint visibility,
+        int object,
+        int prim_addr)
 {
 	/* Primitive index for vertex location lookup. */
 	int prim = kernel_tex_fetch(__prim_index, prim_addr);
@@ -185,11 +186,15 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
 	motion_triangle_vertices(kg, fobject, prim, time, verts);
 	/* Ray-triangle intersection, unoptimized. */
 	float t, u, v;
-	if(ray_triangle_intersect_uv(P,
-	                             dir,
-	                             isect->t,
-	                             verts[2], verts[0], verts[1],
-	                             &u, &v, &t))
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          isect->t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          (ssef*)verts,
+#else
+	                          verts[0], verts[1], verts[2],
+#endif
+	                          &u, &v, &t))
 	{
 #ifdef __VISIBILITY_FLAG__
 		/* Visibility flag test. we do it here under the assumption
@@ -237,11 +242,15 @@ ccl_device_inline void motion_triangle_intersect_subsurface(
 	motion_triangle_vertices(kg, fobject, prim, time, verts);
 	/* Ray-triangle intersection, unoptimized. */
 	float t, u, v;
-	if(ray_triangle_intersect_uv(P,
-	                             dir,
-	                             tmax,
-	                             verts[2], verts[0], verts[1],
-	                             &u, &v, &t))
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          tmax,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          (ssef*)verts,
+#else
+	                          verts[0], verts[1], verts[2],
+#endif
+	                          &u, &v, &t))
 	{
 		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
 			if(ss_isect->hits[i].t == t) {
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
index 0e024a05db6..cb456056e20 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -39,26 +39,26 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
                                                       bool subsurface)
 {
 	/* Get shader. */
-	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+	sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 	/* Get motion info. */
 	/* TODO(sergey): This logic is really similar to motion_triangle_vertices(),
 	 * can we de-duplicate something here?
 	 */
 	int numsteps, numverts;
-	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
+	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
 	/* Figure out which steps we need to fetch and their interpolation factor. */
 	int maxstep = numsteps*2;
-	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
-	float t = ccl_fetch(sd, time)*maxstep - step;
+	int step = min((int)(sd->time*maxstep), maxstep-1);
+	float t = sd->time*maxstep - step;
 	/* Find attribute. */
 	AttributeElement elem;
-	int offset = find_attribute_motion(kg, ccl_fetch(sd, object),
+	int offset = find_attribute_motion(kg, sd->object,
 	                                   ATTR_STD_MOTION_VERTEX_POSITION,
 	                                   &elem);
 	kernel_assert(offset != ATTR_STD_NOT_FOUND);
 	/* Fetch vertex coordinates. */
 	float3 verts[3], next_verts[3];
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
 	/* Interpolate between steps. */
@@ -68,7 +68,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
 	/* Compute refined position. */
 #ifdef __SUBSURFACE__
 	if(subsurface) {
-		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg,
+		sd->P = motion_triangle_refine_subsurface(kg,
 		                                                     sd,
 		                                                     isect,
 		                                                     ray,
@@ -77,29 +77,29 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
 	else
 #endif  /*  __SUBSURFACE__*/
 	{
-		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
+		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
 	}
 	/* Compute face normal. */
 	float3 Ng;
-	if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+	if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
 		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
 	}
 	else {
 		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
 	}
-	ccl_fetch(sd, Ng) = Ng;
-	ccl_fetch(sd, N) = Ng;
+	sd->Ng = Ng;
+	sd->N = Ng;
 	/* Compute derivatives of P w.r.t. uv. */
 #ifdef __DPDU__
-	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
-	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
+	sd->dPdu = (verts[0] - verts[2]);
+	sd->dPdv = (verts[1] - verts[2]);
 #endif
 	/* Compute smooth normal. */
-	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+	if(sd->shader & SHADER_SMOOTH_NORMAL) {
 		/* Find attribute. */
 		AttributeElement elem;
 		int offset = find_attribute_motion(kg,
-		                                   ccl_fetch(sd, object),
+		                                   sd->object,
 		                                   ATTR_STD_MOTION_VERTEX_NORMAL,
 		                                   &elem);
 		kernel_assert(offset != ATTR_STD_NOT_FOUND);
@@ -112,10 +112,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
 		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
 		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
 		/* Interpolate between vertices. */
-		float u = ccl_fetch(sd, u);
-		float v = ccl_fetch(sd, v);
+		float u = sd->u;
+		float v = sd->v;
 		float w = 1.0f - u - v;
-		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
+		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
 	}
 }
 
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index f51b2d18657..6ecdfe0173a 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -137,9 +137,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P);
+	*P = transform_point_auto(&sd->ob_tfm, *P);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -149,9 +149,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
 ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P);
+	*P = transform_point_auto(&sd->ob_itfm, *P);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -161,12 +161,12 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) {
-		*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
+	if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
+		*N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N));
 	}
 #else
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
-		Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	if(sd->object != OBJECT_NONE) {
+		Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 		*N = normalize(transform_direction_transposed(&tfm, *N));
 	}
 #endif
@@ -177,9 +177,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
 ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N));
+	*N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N));
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -189,9 +189,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
 ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D);
+	*D = transform_direction_auto(&sd->ob_tfm, *D);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -201,9 +201,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
 ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D);
+	*D = transform_direction_auto(&sd->ob_itfm, *D);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -212,13 +212,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
 
 ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(ccl_fetch(sd, object) == OBJECT_NONE)
+	if(sd->object == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 #ifdef __OBJECT_MOTION__
-	return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
+	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
 #endif
 }
@@ -326,7 +326,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1);
+	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE + 1);
 }
 
 /* Particle data from which object was instanced */
@@ -425,7 +425,13 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 
 /* Transform ray into object space to enter static object in BVH */
 
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
+ccl_device_inline float bvh_instance_push(KernelGlobals *kg,
+                                          int object,
+                                          const Ray *ray,
+                                          float3 *P,
+                                          float3 *dir,
+                                          float3 *idir,
+                                          float t)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 
@@ -435,8 +441,11 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
 	*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
 	*idir = bvh_inverse_direction(*dir);
 
-	if(*t != FLT_MAX)
-		*t *= len;
+	if(t != FLT_MAX) {
+		t *= len;
+	}
+
+	return t;
 }
 
 #ifdef __QBVH__
@@ -473,16 +482,24 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
 
 /* Transorm ray to exit static object in BVH */
 
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
+ccl_device_inline float bvh_instance_pop(KernelGlobals *kg,
+                                         int object,
+                                         const Ray *ray,
+                                         float3 *P,
+                                         float3 *dir,
+                                         float3 *idir,
+                                         float t)
 {
-	if(*t != FLT_MAX) {
+	if(t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-		*t /= len(transform_direction(&tfm, ray->D));
+		t /= len(transform_direction(&tfm, ray->D));
 	}
 
 	*P = ray->P;
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
+
+	return t;
 }
 
 /* Same as above, but returns scale factor to apply to multiple intersection distances */
@@ -501,13 +518,13 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co
 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
                                                 int object,
                                                 const Ray *ray,
                                                 float3 *P,
                                                 float3 *dir,
                                                 float3 *idir,
-                                                ccl_addr_space float *t,
+                                                float t,
                                                 Transform *itfm)
 {
 	object_fetch_transform_motion_test(kg, object, ray->time, itfm);
@@ -518,8 +535,11 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg,
 	*dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
 	*idir = bvh_inverse_direction(*dir);
 
-	if(*t != FLT_MAX)
-		*t *= len;
+	if(t != FLT_MAX) {
+		t *= len;
+	}
+
+	return t;
 }
 
 #ifdef __QBVH__
@@ -557,22 +577,24 @@ ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg,
 
 /* Transorm ray to exit motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg,
-                                               int object,
-                                               const Ray *ray,
-                                               float3 *P,
-                                               float3 *dir,
-                                               float3 *idir,
-                                               ccl_addr_space float *t,
-                                               Transform *itfm)
-{
-	if(*t != FLT_MAX) {
-		*t /= len(transform_direction(itfm, ray->D));
+ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
+                                                int object,
+                                                const Ray *ray,
+                                                float3 *P,
+                                                float3 *dir,
+                                                float3 *idir,
+                                                float t,
+                                                Transform *itfm)
+{
+	if(t != FLT_MAX) {
+		t /= len(transform_direction(itfm, ray->D));
 	}
 
 	*P = ray->P;
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
+
+	return t;
 }
 
 /* Same as above, but returns scale factor to apply to multiple intersection distances */
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index 6a0ff5a4a04..5663b598508 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float val = 0.0f;
@@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float3 val = make_float3(0.0f, 0.0f, 0.0f);
@@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float3 val = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 8a73bb2f78b..90a9c2147cc 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg,
                                                   const AttributeDescriptor desc,
                                                   float *dx, float *dy)
 {
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		if(subd_triangle_patch(kg, sd) == ~0)
 			return triangle_attribute_float(kg, sd, desc, dx, dy);
 		else
 			return subd_triangle_attribute_float(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+	else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
@@ -56,19 +56,19 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
                                                     const AttributeDescriptor desc,
                                                     float3 *dx, float3 *dy)
 {
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		if(subd_triangle_patch(kg, sd) == ~0)
 			return triangle_attribute_float3(kg, sd, desc, dx, dy);
 		else
 			return subd_triangle_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+	else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
@@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
+	if(sd->type & PRIMITIVE_ALL_CURVE)
 #  ifdef __DPDU__
-		return normalize(ccl_fetch(sd, dPdu));
+		return normalize(sd->dPdu);
 #  else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #  endif
@@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 		float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
 		object_normal_transform(kg, sd, &data);
-		return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
+		return cross(sd->N, normalize(cross(data, sd->N)));
 	}
 	else {
 		/* otherwise use surface derivatives */
 #ifdef __DPDU__
-		return normalize(ccl_fetch(sd, dPdu));
+		return normalize(sd->dPdu);
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -153,17 +153,17 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	float3 center;
 
 #ifdef __HAIR__
-	bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
+	bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
 	if(is_curve_primitive) {
 		center = curve_motion_center_location(kg, sd);
 
-		if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &center);
 		}
 	}
 	else
 #endif
-		center = ccl_fetch(sd, P);
+		center = sd->P;
 
 	float3 motion_pre = center, motion_post = center;
 
@@ -173,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	if(desc.offset != ATTR_STD_NOT_FOUND) {
 		/* get motion info */
 		int numverts, numkeys;
-		object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
+		object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
 
 		/* lookup attributes */
 		motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
-		desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
+		desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
 		motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
 #ifdef __HAIR__
-		if(is_curve_primitive && (ccl_fetch(sd, object_flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+		if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
 			object_position_transform(kg, sd, &motion_pre);
 			object_position_transform(kg, sd, &motion_post);
 		}
@@ -193,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	 * transformation was set match the world/object space of motion_pre/post */
 	Transform tfm;
 	
-	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
+	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
 	motion_pre = transform_point(&tfm, motion_pre);
 
-	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
+	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
 	motion_post = transform_point(&tfm, motion_post);
 
 	float3 motion_center;
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 647840dc696..044e82f03d4 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
 {
-	return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0;
+	return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
 }
 
 /* UV coords of triangle within patch */
 
 ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3])
 {
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 	uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x);
 	uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y);
@@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float2 dpdv = uv[1] - uv[2];
 
 		/* p is [s, t] */
-		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+		float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
 
 		float a, dads, dadt;
 		a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
@@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 			float dtdv = dpdv.y;
 
 			if(dx) {
-				float dudx = ccl_fetch(sd, du).dx;
-				float dvdx = ccl_fetch(sd, dv).dx;
+				float dudx = sd->du.dx;
+				float dvdx = sd->dv.dx;
 
 				float dsdx = dsdu*dudx + dsdv*dvdx;
 				float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 				*dx = dads*dsdx + dadt*dtdx;
 			}
 			if(dy) {
-				float dudy = ccl_fetch(sd, du).dy;
-				float dvdy = ccl_fetch(sd, dv).dy;
+				float dudy = sd->du.dy;
+				float dvdy = sd->dv.dy;
 
 				float dsdy = dsdu*dudy + dsdv*dvdy;
 				float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -174,11 +174,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER) {
 		float2 uv[3];
@@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float2 dpdv = uv[1] - uv[2];
 
 		/* p is [s, t] */
-		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+		float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
 
 		float3 a, dads, dadt;
 
@@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 			float dtdv = dpdv.y;
 
 			if(dx) {
-				float dudx = ccl_fetch(sd, du).dx;
-				float dvdx = ccl_fetch(sd, dv).dx;
+				float dudx = sd->du.dx;
+				float dvdx = sd->dv.dx;
 
 				float dsdx = dsdu*dudx + dsdv*dvdx;
 				float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 				*dx = dads*dsdx + dadt*dtdx;
 			}
 			if(dy) {
-				float dudy = ccl_fetch(sd, du).dy;
-				float dvdy = ccl_fetch(sd, dv).dy;
+				float dudy = sd->du.dy;
+				float dvdy = sd->dv.dy;
 
 				float dsdy = dsdu*dudy + dsdv*dvdy;
 				float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
 		float2 uv[3];
@@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 3229091bbb0..47778553b94 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -26,13 +26,13 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
 	/* load triangle vertices */
-	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 	const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0));
 	const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1));
 	const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2));
 
 	/* return normal */
-	if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+	if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
 		return normalize(cross(v2 - v0, v1 - v0));
 	}
 	else {
@@ -110,34 +110,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
 
-		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
 	}
 	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
-		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x);
 		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y);
 		float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER) {
-		int tri = desc.offset + ccl_fetch(sd, prim)*3;
+		int tri = desc.offset + sd->prim*3;
 		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
 		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
 		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -153,24 +153,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
 	}
 	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
-		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
 		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
-		int tri = desc.offset + ccl_fetch(sd, prim)*3;
+		int tri = desc.offset + sd->prim*3;
 		float3 f0, f1, f2;
 
 		if(desc.element == ATTR_ELEMENT_CORNER) {
@@ -185,11 +185,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		}
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index 4db121d94f4..804e74d7e37 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -22,232 +22,50 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Workaround stupidness of CUDA/OpenCL which doesn't allow to access indexed
- * component of float3 value.
- */
-#ifndef __KERNEL_CPU__
-#  define IDX(vec, idx) \
-    ((idx == 0) ? ((vec).x) : ( (idx == 1) ? ((vec).y) : ((vec).z) ))
-#else
-#  define IDX(vec, idx) ((vec)[idx])
-#endif
-
-/* Ray-Triangle intersection for BVH traversal
- *
- * Sven Woop
- * Watertight Ray/Triangle Intersection
- *
- * http://jcgt.org/published/0002/01/05/paper.pdf
- */
-
-/* Precalculated data for the ray->tri intersection. */
-typedef struct IsectPrecalc {
-	/* Maximal dimension kz, and orthogonal dimensions. */
-	int kx, ky, kz;
-
-	/* Shear constants. */
-	float Sx, Sy, Sz;
-} IsectPrecalc;
-
-#if (defined(__KERNEL_OPENCL_APPLE__)) || \
-    (defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86)))
-ccl_device_noinline
-#else
-ccl_device_inline
-#endif
-void triangle_intersect_precalc(float3 dir,
-                                IsectPrecalc *isect_precalc)
-{
-	/* Calculate dimension where the ray direction is maximal. */
-#ifndef __KERNEL_SSE__
-	int kz = util_max_axis(make_float3(fabsf(dir.x),
-	                                   fabsf(dir.y),
-	                                   fabsf(dir.z)));
-	int kx = kz + 1; if(kx == 3) kx = 0;
-	int ky = kx + 1; if(ky == 3) ky = 0;
-#else
-	int kx, ky, kz;
-	/* Avoiding mispredicted branch on direction. */
-	kz = util_max_axis(fabs(dir));
-	static const char inc_xaxis[] = {1, 2, 0, 55};
-	static const char inc_yaxis[] = {2, 0, 1, 55};
-	kx = inc_xaxis[kz];
-	ky = inc_yaxis[kz];
-#endif
-
-	float dir_kz = IDX(dir, kz);
-
-	/* Swap kx and ky dimensions to preserve winding direction of triangles. */
-	if(dir_kz < 0.0f) {
-		int tmp = kx;
-		kx = ky;
-		ky = tmp;
-	}
-
-	/* Calculate the shear constants. */
-	float inv_dir_z = 1.0f / dir_kz;
-	isect_precalc->Sx = IDX(dir, kx) * inv_dir_z;
-	isect_precalc->Sy = IDX(dir, ky) * inv_dir_z;
-	isect_precalc->Sz = inv_dir_z;
-
-	/* Store the dimensions. */
-	isect_precalc->kx = kx;
-	isect_precalc->ky = ky;
-	isect_precalc->kz = kz;
-}
-
-/* TODO(sergey): Make it general utility function. */
-ccl_device_inline float xor_signmask(float x, int y)
-{
-	return __int_as_float(__float_as_int(x) ^ y);
-}
-
 ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
-                                          const IsectPrecalc *isect_precalc,
                                           Intersection *isect,
                                           float3 P,
+                                          float3 dir,
                                           uint visibility,
                                           int object,
                                           int prim_addr)
 {
-	const int kx = isect_precalc->kx;
-	const int ky = isect_precalc->ky;
-	const int kz = isect_precalc->kz;
-	const float Sx = isect_precalc->Sx;
-	const float Sy = isect_precalc->Sy;
-	const float Sz = isect_precalc->Sz;
-
-	/* Calculate vertices relative to ray origin. */
 	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-	const avxf avxf_P(P.m128, P.m128);
-
-	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
-	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
-
-	const avxf AB = tri_ab - avxf_P;
-	const avxf BC = tri_bc - avxf_P;
-
-	const __m256i permute_mask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
-
-	const avxf AB_k = shuffle(AB, permute_mask);
-	const avxf BC_k = shuffle(BC, permute_mask);
-
-	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
-	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
-
-	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
-	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
-
-	const avxf Sxy(Sy, Sx, Sy, Sx);
-
-	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
-	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
-
-	float ABBC_kz_array[8];
-	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
-
-	const float A_kz = ABBC_kz_array[0];
-	const float B_kz = ABBC_kz_array[2];
-	const float C_kz = ABBC_kz_array[6];
-
-	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
-	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
-
-	const avxf neg_mask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
-
-	/* W           U                             V
-	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
-	 */
-	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, neg_mask /* Dont care */);
-
-	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ neg_mask;
-
-	/* Calculate scaled barycentric coordinates. */
-	float WUVW_array[4];
-	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
-
-	const float W = WUVW_array[0];
-	const float U = WUVW_array[1];
-	const float V = WUVW_array[2];
-
-	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
-	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
-	                                               _mm256_setzero_ps(), 0));
-
-	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
-		return false;
-	}
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
 #else
 	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
 	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
 	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
-	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
-	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
-	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
-
-	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
-	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
-	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
-
-	/* Perform shear and scale of vertices. */
-	const float Ax = A_kx - Sx * A_kz;
-	const float Ay = A_ky - Sy * A_kz;
-	const float Bx = B_kx - Sx * B_kz;
-	const float By = B_ky - Sy * B_kz;
-	const float Cx = C_kx - Sx * C_kz;
-	const float Cy = C_ky - Sy * C_kz;
-
-	/* Calculate scaled barycentric coordinates. */
-	float U = Cx * By - Cy * Bx;
-	float V = Ax * Cy - Ay * Cx;
-	float W = Bx * Ay - By * Ax;
-	if((U < 0.0f || V < 0.0f || W < 0.0f) &&
-	   (U > 0.0f || V > 0.0f || W > 0.0f))
-	{
-		return false;
-	}
 #endif
-
-	/* Calculate determinant. */
-	float det = U + V + W;
-	if(UNLIKELY(det == 0.0f)) {
-		return false;
-	}
-
-	/* Calculate scaled z-coordinates of vertices and use them to calculate
-	 * the hit distance.
-	 */
-	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
-	const int sign_det = (__float_as_int(det) & 0x80000000);
-	const float sign_T = xor_signmask(T, sign_det);
-	if((sign_T < 0.0f) ||
-	   (sign_T > isect->t * xor_signmask(det, sign_det)))
+	float t, u, v;
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          isect->t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          ssef_verts,
+#else
+	                          float4_to_float3(tri_a),
+	                          float4_to_float3(tri_b),
+	                          float4_to_float3(tri_c),
+#endif
+	                          &u, &v, &t))
 	{
-		return false;
-	}
-
 #ifdef __VISIBILITY_FLAG__
-	/* visibility flag test. we do it here under the assumption
-	 * that most triangles are culled by node flags */
-	if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
+		/* Visibility flag test. We do it here under the assumption
+		 * that most triangles are culled by node flags.
+		 */
+		if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
 #endif
-	{
-#ifdef __KERNEL_CUDA__
-		if(A == B && B == C) {
-			return false;
+		{
+			isect->prim = prim_addr;
+			isect->object = object;
+			isect->type = PRIMITIVE_TRIANGLE;
+			isect->u = u;
+			isect->v = v;
+			isect->t = t;
+			return true;
 		}
-#endif
-		/* Normalize U, V, W, and T. */
-		const float inv_det = 1.0f / det;
-		isect->prim = prim_addr;
-		isect->object = object;
-		isect->type = PRIMITIVE_TRIANGLE;
-		isect->u = U * inv_det;
-		isect->v = V * inv_det;
-		isect->t = T * inv_det;
-		return true;
 	}
 	return false;
 }
@@ -260,138 +78,37 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 #ifdef __SUBSURFACE__
 ccl_device_inline void triangle_intersect_subsurface(
         KernelGlobals *kg,
-        const IsectPrecalc *isect_precalc,
         SubsurfaceIntersection *ss_isect,
         float3 P,
+        float3 dir,
         int object,
         int prim_addr,
         float tmax,
         uint *lcg_state,
         int max_hits)
 {
-	const int kx = isect_precalc->kx;
-	const int ky = isect_precalc->ky;
-	const int kz = isect_precalc->kz;
-	const float Sx = isect_precalc->Sx;
-	const float Sy = isect_precalc->Sy;
-	const float Sz = isect_precalc->Sz;
-
-	/* Calculate vertices relative to ray origin. */
 	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
-	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
-	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
-	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-	const avxf avxf_P(P.m128, P.m128);
-
-	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
-	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
-
-	const avxf AB = tri_ab - avxf_P;
-	const avxf BC = tri_bc - avxf_P;
-
-	const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
-
-	const avxf AB_k = shuffle(AB, permuteMask);
-	const avxf BC_k = shuffle(BC, permuteMask);
-
-	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
-	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
-
-	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
-	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
-
-	const avxf Sxy(Sy, Sx, Sy, Sx);
-
-	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
-	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
-
-	float ABBC_kz_array[8];
-	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
-
-	const float A_kz = ABBC_kz_array[0];
-	const float B_kz = ABBC_kz_array[2];
-	const float C_kz = ABBC_kz_array[6];
-
-	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
-	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
-
-	const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
-
-	/* W           U                             V
-	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
-	 */
-	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
-
-	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
-
-	/* Calculate scaled barycentric coordinates. */
-	float WUVW_array[4];
-	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
-
-	const float W = WUVW_array[0];
-	const float U = WUVW_array[1];
-	const float V = WUVW_array[2];
-
-	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
-	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
-	                                               _mm256_setzero_ps(), 0));
-
-	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
-		return;
-	}
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
 #else
-	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
-	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
-	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
-
-	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
-	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
-	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
-
-	/* Perform shear and scale of vertices. */
-	const float Ax = A_kx - Sx * A_kz;
-	const float Ay = A_ky - Sy * A_kz;
-	const float Bx = B_kx - Sx * B_kz;
-	const float By = B_ky - Sy * B_kz;
-	const float Cx = C_kx - Sx * C_kz;
-	const float Cy = C_ky - Sy * C_kz;
-
-	/* Calculate scaled barycentric coordinates. */
-	float U = Cx * By - Cy * Bx;
-	float V = Ax * Cy - Ay * Cx;
-	float W = Bx * Ay - By * Ax;
-
-	if((U < 0.0f || V < 0.0f || W < 0.0f) &&
-	   (U > 0.0f || V > 0.0f || W > 0.0f))
-	{
-		return;
-	}
+	const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
+	             tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
+	             tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
 #endif
-
-	/* Calculate determinant. */
-	float det = U + V + W;
-	if(UNLIKELY(det == 0.0f)) {
-		return;
-	}
-
-	/* Calculate scaled z−coordinates of vertices and use them to calculate
-	 * the hit distance.
-	 */
-	const int sign_det = (__float_as_int(det) & 0x80000000);
-	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
-	const float sign_T = xor_signmask(T, sign_det);
-	if((sign_T < 0.0f) ||
-	   (sign_T > tmax * xor_signmask(det, sign_det)))
+	float t, u, v;
+	if(!ray_triangle_intersect(P,
+	                           dir,
+	                           tmax,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                           ssef_verts,
+#else
+	                           tri_a, tri_b, tri_c,
+#endif
+	                           &u, &v, &t))
 	{
 		return;
 	}
 
-	/* Normalize U, V, W, and T. */
-	const float inv_det = 1.0f / det;
-
-	const float t = T * inv_det;
 	for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
 		if(ss_isect->hits[i].t == t) {
 			return;
@@ -418,18 +135,19 @@ ccl_device_inline void triangle_intersect_subsurface(
 	isect->prim = prim_addr;
 	isect->object = object;
 	isect->type = PRIMITIVE_TRIANGLE;
-	isect->u = U * inv_det;
-	isect->v = V * inv_det;
+	isect->u = u;
+	isect->v = v;
 	isect->t = t;
 
 	/* Record geometric normal. */
-	/* TODO(sergey): Use float4_to_float3() on just an edges. */
-	const float3 v0 = float4_to_float3(tri_a);
-	const float3 v1 = float4_to_float3(tri_b);
-	const float3 v2 = float4_to_float3(tri_c);
-	ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0));
-}
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
+	             tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
+	             tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
 #endif
+	ss_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+}
+#endif  /* __SUBSURFACE__ */
 
 /* Refine triangle intersection to more precise hit point. For rays that travel
  * far the precision is often not so good, this reintersects the primitive from
@@ -457,7 +175,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 			return P;
 		}
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #  else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #  endif
@@ -491,7 +209,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #  else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #  endif
@@ -519,7 +237,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -557,7 +275,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -570,6 +288,4 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 	return P;
 }
 
-#undef IDX
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 03724c955be..1e0ef5201c9 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -64,7 +64,7 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
 
 ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
 {
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
+	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_CUDA__
 #  if __CUDA_ARCH__ >= 300
 	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
@@ -91,7 +91,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 
 ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
+	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_CUDA__
 #  if __CUDA_ARCH__ >= 300
 	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 9279a94c13a..06c0fb2fbca 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -19,7 +19,8 @@
 
 /* CPU Kernel Interface */
 
-#include "util_types.h"
+#include "util/util_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN
 #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
 
 struct KernelGlobals;
+struct KernelData;
 
 KernelGlobals *kernel_globals_create();
 void kernel_globals_free(KernelGlobals *kg);
@@ -46,31 +48,31 @@ void kernel_tex_copy(KernelGlobals *kg,
                      ExtensionType extension = EXTENSION_REPEAT);
 
 #define KERNEL_ARCH cpu
-#include "kernels/cpu/kernel_cpu.h"
+#include "kernel/kernels/cpu/kernel_cpu.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 #  define KERNEL_ARCH cpu_sse2
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 #  define KERNEL_ARCH cpu_sse3
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 #  define KERNEL_ARCH cpu_sse41
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 #  define KERNEL_ARCH cpu_avx
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 #  define KERNEL_ARCH cpu_avx2
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 6c3ee6b8098..823d30dde78 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -52,10 +52,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 	{
 		eval->diffuse = value;
 	}
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis = make_float3(0.0f, 0.0f, 0.0f);
+#endif
 }
 
-ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value, float mis_weight)
 {
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis += value;
+#endif
+	value *= mis_weight;
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
 		if(CLOSURE_IS_BSDF_DIFFUSE(type))
@@ -96,7 +103,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 	}
 }
 
-ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
+ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -115,8 +122,19 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
 	}
 }
 
+ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
+{
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis *= value;
+#endif
+	bsdf_eval_mis(eval, value);
+}
+
 ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
 {
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis *= value;
+#endif
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
 		eval->diffuse *= value;
@@ -134,7 +152,7 @@ ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
 #endif
 }
 
-ccl_device_inline float3 bsdf_eval_sum(BsdfEval *eval)
+ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -198,6 +216,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 	{
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 	}
+
+#ifdef __SHADOW_TRICKS__
+	L->path_total = make_float3(0.0f, 0.0f, 0.0f);
+	L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
+	L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
+#endif
 }
 
 ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -252,7 +276,12 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
 	}
 }
 
-ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce)
+ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+                                              float3 throughput,
+                                              float3 alpha,
+                                              float3 bsdf,
+                                              float3 ao,
+                                              int bounce)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
@@ -271,6 +300,26 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput
 	{
 		L->emission += throughput*bsdf*ao;
 	}
+
+#ifdef __SHADOW_TRICKS__
+	float3 light = throughput * bsdf;
+	L->path_total += light;
+	L->path_total_shaded += ao * light;
+#endif
+}
+
+ccl_device_inline void path_radiance_accum_total_ao(
+        PathRadiance *L,
+        float3 throughput,
+        float3 bsdf)
+{  /* Add throughput * bsdf to L->path_total only; L->path_total_shaded is not modified. */
+#ifdef __SHADOW_TRICKS__
+	L->path_total += throughput * bsdf;
+#else
+	(void) L;  /* without __SHADOW_TRICKS__ this is a no-op; the casts only mark the parameters as used */
+	(void) throughput;
+	(void) bsdf;
+#endif
 }
 
 ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
@@ -301,15 +350,38 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 	{
 		L->emission += throughput*bsdf_eval->diffuse*shadow;
 	}
+
+#ifdef __SHADOW_TRICKS__
+	float3 light = throughput * bsdf_eval->sum_no_mis;
+	L->path_total += light;
+	L->path_total_shaded += shadow * light;
+#endif
+}
+
+ccl_device_inline void path_radiance_accum_total_light(
+        PathRadiance *L,
+        float3 throughput,
+        const BsdfEval *bsdf_eval)
+{  /* Add throughput * bsdf_eval->sum_no_mis to L->path_total only; L->path_total_shaded is not modified. */
+#ifdef __SHADOW_TRICKS__
+	L->path_total += throughput * bsdf_eval->sum_no_mis;
+#else
+	(void) L;  /* without __SHADOW_TRICKS__ this is a no-op; the casts only mark the parameters as used */
+	(void) throughput;
+	(void) bsdf_eval;
+#endif
 }
 
-ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce)
+ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
+                                                      ccl_addr_space PathState *state,
+                                                      float3 throughput,
+                                                      float3 value)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0)
+		if(state->bounce == 0)
 			L->background += throughput*value;
-		else if(bounce == 1)
+		else if(state->bounce == 1)
 			L->direct_emission += throughput*value;
 		else
 			L->indirect += throughput*value;
@@ -319,6 +391,13 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 th
 	{
 		L->emission += throughput*value;
 	}
+
+#ifdef __SHADOW_TRICKS__
+	L->path_total += throughput * value;
+	if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
+		L->path_total_shaded += throughput * value;
+	}
+#endif
 }
 
 ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -399,7 +478,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 		float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
 
 		/* Reject invalid value */
-		if(!isfinite(sum)) {
+		if(!isfinite_safe(sum)) {
 			kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!");
 			L_sum = make_float3(0.0f, 0.0f, 0.0f);
 
@@ -468,7 +547,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 
 	/* Reject invalid value */
 	float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
-	if(!isfinite(sum)) {
+	if(!isfinite_safe(sum)) {
 		kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
 		L_sum = make_float3(0.0f, 0.0f, 0.0f);
 	}
@@ -501,5 +580,34 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
 	L->emission += L_sample->emission * fac;
 }
 
-CCL_NAMESPACE_END
+#ifdef __SHADOW_TRICKS__
+/* Shadow factor of the path: average(path_total_shaded) / average(path_total), or 1.0 when that total is zero. */
+ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
+{
+	float path_total = average(L->path_total);
+	float path_total_shaded = average(L->path_total_shaded);
+	if(path_total != 0.0f) {  /* guards the division below */
+		return path_total_shaded / path_total;
+	}
+	return 1.0f;  /* fallback when the averaged total is exactly zero */
+}
 
+/* Final light sum for a shadow catcher path, derived from the shadow factor of path_radiance_sum_shadow(L). */
+ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
+                                                         const PathRadiance *L,
+                                                         ccl_addr_space float* L_transparent)
+{
+	const float shadow = path_radiance_sum_shadow(L);
+	float3 L_sum;
+	if(kernel_data.background.transparent) {
+		*L_transparent = shadow;  /* the shadow factor is handed back through L_transparent */
+		L_sum = make_float3(0.0f, 0.0f, 0.0f);  /* and the returned color is black */
+	}
+	else {
+		L_sum = L->shadow_color * shadow;  /* *L_transparent is left untouched on this branch */
+	}
+	return L_sum;
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 5bcc57cdcdf..f18d145f7cf 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -54,7 +54,8 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 	float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
 	shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
 
-	/* TODO, disable the closures we won't need */
+	/* TODO, disable more closures we don't need besides transparent */
+	shader_bsdf_disable_transparency(kg, sd);
 
 #ifdef __BRANCHED_PATH__
 	if(!kernel_data.integrator.branched) {
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index dedac6b1465..0df5217d97a 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -457,7 +457,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
-		if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+		if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
 			P += camera_position(kg);
 
 		Transform tfm = kernel_data.cam.worldtondc;
@@ -467,7 +467,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 		/* panorama */
 		Transform tfm = kernel_data.cam.worldtocamera;
 
-		if(ccl_fetch(sd, object) != OBJECT_NONE)
+		if(sd->object != OBJECT_NONE)
 			P = normalize(transform_point(&tfm, P));
 		else
 			P = normalize(transform_direction(&tfm, P));
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 9d1f3bdc918..ae7c9b836c4 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -35,15 +35,24 @@
 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
 #endif
 
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_simd.h"
-#include "util_half.h"
-#include "util_types.h"
-#include "util_texture.h"
+#include "util/util_debug.h"
+#include "util/util_math.h"
+#include "util/util_simd.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+#include "util/util_texture.h"
 
 #define ccl_addr_space
 
+#define ccl_local_id(d) 0  /* CPU: local id is always 0 and local size always 1 (see below) */
+#define ccl_global_id(d) (kg->global_id[d])  /* expands to a read through `kg`, which must be in scope at the use site */
+
+#define ccl_local_size(d) 1
+#define ccl_global_size(d) (kg->global_size[d])  /* likewise read through `kg` */
+
+#define ccl_group_id(d) ccl_global_id(d)  /* with a local size of 1 the group index equals the global index */
+#define ccl_num_groups(d) ccl_global_size(d)  /* and the group count equals the global size */
+
 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
  * much slower than the double version.  This was fixed in glibc 2.16.
  */
@@ -78,9 +87,9 @@ template<typename T> struct texture  {
 	ccl_always_inline avxf fetch_avxf(const int index)
 	{
 		kernel_assert(index >= 0 && (index+1) < width);
-		ssef *ssefData = (ssef*)data;
-		ssef *ssefNodeData = &ssefData[index];
-		return _mm256_loadu_ps((float *)ssefNodeData);
+		ssef *ssef_data = (ssef*)data;
+		ssef *ssef_node_data = &ssef_data[index];
+		return _mm256_loadu_ps((float *)ssef_node_data);
 	}
 
 #endif
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e0c7b17c6a0..39e98c7dda6 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -38,7 +38,7 @@
 
 #define ccl_device  __device__ __inline__
 #  define ccl_device_forceinline  __device__ __forceinline__
-#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
+#if __CUDA_ARCH__ < 500
 #  define ccl_device_inline  __device__ __forceinline__
 #else
 #  define ccl_device_inline  __device__ __inline__
@@ -46,6 +46,9 @@
 #define ccl_device_noinline  __device__ __noinline__
 #define ccl_global
 #define ccl_constant
+#define ccl_local __shared__
+#define ccl_local_param
+#define ccl_private
 #define ccl_may_alias
 #define ccl_addr_space
 #define ccl_restrict __restrict__
@@ -57,8 +60,54 @@
 
 /* Types */
 
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+
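+/* Work item functions: CUDA counterparts of the ccl_local_id()/ccl_global_id() family used by the other kernel back ends. */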
+
+ccl_device_inline uint ccl_local_id(uint d)
+{  /* Thread index within its block along dimension d (0 = x, 1 = y, 2 = z). */
+	switch(d) {
+		case 0: return threadIdx.x;
+		case 1: return threadIdx.y;
+		case 2: return threadIdx.z;
+		default: return 0;  /* any other d yields 0 */
+	}
+}
+
+#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+
+ccl_device_inline uint ccl_local_size(uint d)
+{  /* Block extent along dimension d (0 = x, 1 = y, 2 = z). */
+	switch(d) {
+		case 0: return blockDim.x;
+		case 1: return blockDim.y;
+		case 2: return blockDim.z;
+		default: return 0;  /* any other d yields 0 */
+	}
+}
+
+#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+
+ccl_device_inline uint ccl_group_id(uint d)
+{  /* Block index within the grid along dimension d (0 = x, 1 = y, 2 = z). */
+	switch(d) {
+		case 0: return blockIdx.x;
+		case 1: return blockIdx.y;
+		case 2: return blockIdx.z;
+		default: return 0;  /* any other d yields 0 */
+	}
+}
+
+ccl_device_inline uint ccl_num_groups(uint d)
+{  /* Grid extent, in blocks, along dimension d (0 = x, 1 = y, 2 = z). */
+	switch(d) {
+		case 0: return gridDim.x;
+		case 1: return gridDim.y;
+		case 2: return gridDim.z;
+		default: return 0;  /* any other d yields 0 */
+	}
+}
 
 /* Textures */
 
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index f076e3a7d37..c2263ac0d49 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -39,6 +39,7 @@
 #define ccl_constant __constant
 #define ccl_global __global
 #define ccl_local __local
+#define ccl_local_param __local
 #define ccl_private __private
 #define ccl_restrict restrict
 #define ccl_align(n) __attribute__((aligned(n)))
@@ -49,6 +50,15 @@
 #  define ccl_addr_space
 #endif
 
+#define ccl_local_id(d) get_local_id(d)  /* OpenCL: each ccl_* work-item macro forwards to the get_* call of the same name */
+#define ccl_global_id(d) get_global_id(d)
+
+#define ccl_local_size(d) get_local_size(d)
+#define ccl_global_size(d) get_global_size(d)
+
+#define ccl_group_id(d) get_group_id(d)
+#define ccl_num_groups(d) get_num_groups(d)
+
 /* Selective nodes compilation. */
 #ifndef __NODES_MAX_GROUP__
 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
@@ -133,8 +143,8 @@
 /* define NULL */
 #define NULL 0
 
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
 
 #endif /* __KERNEL_COMPAT_OPENCL_H__ */
 
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 8c7c651a053..9e7d51f23f5 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -67,7 +67,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		                         ls->shader, ls->object, ls->prim,
 		                         ls->u, ls->v, t, time, false, ls->lamp);
 
-		ls->Ng = ccl_fetch(emission_sd, Ng);
+		ls->Ng = emission_sd->Ng;
 
 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
@@ -76,7 +76,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		path_state_modify_bounce(state, false);
 
 		/* evaluate emissive closure */
-		if(ccl_fetch(emission_sd, flag) & SD_EMISSION)
+		if(emission_sd->flag & SD_EMISSION)
 			eval = shader_emissive_eval(kg, emission_sd);
 		else
 			eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -112,7 +112,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	                                         -ls->D,
 	                                         dD,
 	                                         ls->t,
-	                                         ccl_fetch(sd, time));
+	                                         sd->time);
 
 	if(is_zero(light_eval))
 		return false;
@@ -120,7 +120,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	/* evaluate BSDF at shading point */
 
 #ifdef __VOLUME__
-	if(ccl_fetch(sd, prim) != PRIM_NONE)
+	if(sd->prim != PRIM_NONE)
 		shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
 	else {
 		float bsdf_pdf;
@@ -156,8 +156,13 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	if(bsdf_eval_is_zero(eval))
 		return false;
 
-	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold;
+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f
+#ifdef __SHADOW_TRICKS__
+	   && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0
+#endif
+	  )
+	{
+		float probability = max3(fabs(bsdf_eval_sum(eval))) * kernel_data.integrator.light_inv_rr_threshold;
 		if(probability < 1.0f) {
 			if(rand_terminate >= probability) {
 				return false;
@@ -168,8 +173,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 
 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
-		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
-		ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+		bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
+		ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
 
 		if(ls->t == FLT_MAX) {
 			/* distant light */
@@ -182,7 +187,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 			ray->D = normalize_len(ray->D, &ray->t);
 		}
 
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 		ray->dD = differential3_zero();
 	}
 	else {
@@ -204,14 +209,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	float3 L = shader_emissive_eval(kg, sd);
 
 #ifdef __HAIR__
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
 #else
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
 #endif
 	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
-		float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
+		float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
 		float mis_weight = power_heuristic(bsdf_pdf, pdf);
 
 		return L*mis_weight;
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 8e66a3a0340..c9c97ea977e 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -16,6 +16,9 @@
 
 /* Constant Globals */
 
+#ifndef __KERNEL_GLOBALS_H__
+#define __KERNEL_GLOBALS_H__
+
 CCL_NAMESPACE_BEGIN
 
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -44,7 +47,7 @@ typedef struct KernelGlobals {
 
 #  define KERNEL_TEX(type, ttype, name) ttype name;
 #  define KERNEL_IMAGE_TEX(type, ttype, name)
-#  include "kernel_textures.h"
+#  include "kernel/kernel_textures.h"
 
 	KernelData __data;
 
@@ -64,6 +67,13 @@ typedef struct KernelGlobals {
 	/* Storage for decoupled volume steps. */
 	VolumeStep *decoupled_volume_steps[2];
 	int decoupled_volume_steps_index;
+
+	/* split kernel */
+	SplitData split_data;
+	SplitParams split_param_data;
+
+	int2 global_size;
+	int2 global_id;
 } KernelGlobals;
 
 #endif  /* __KERNEL_CPU__ */
@@ -76,7 +86,10 @@ typedef struct KernelGlobals {
 #ifdef __KERNEL_CUDA__
 
 __constant__ KernelData __data;
-typedef struct KernelGlobals {} KernelGlobals;
+typedef struct KernelGlobals {
+	/* NOTE: Keep the array size below (64) in sync with SHADOW_STACK_MAX_HITS. */
+	Intersection hits_stack[64];  /* NOTE(review): presumably scratch storage for recorded shadow-ray hits; confirm against its users */
+} KernelGlobals;
 
 #  ifdef __KERNEL_CUDA_TEX_STORAGE__
 #    define KERNEL_TEX(type, ttype, name) ttype name;
@@ -84,7 +97,7 @@ typedef struct KernelGlobals {} KernelGlobals;
 #    define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
 #  endif
 #  define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
-#  include "kernel_textures.h"
+#  include "kernel/kernel_textures.h"
 
 #endif  /* __KERNEL_CUDA__ */
 
@@ -97,11 +110,11 @@ typedef ccl_addr_space struct KernelGlobals {
 
 #  define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name;
-#  include "kernel_textures.h"
+#  include "kernel/kernel_textures.h"
 
 #  ifdef __SPLIT_KERNEL__
-	ShaderData *sd_input;
-	Intersection *isect_shadow;
+	SplitData split_data;
+	SplitParams split_param_data;
 #  endif
 } KernelGlobals;
 
@@ -143,3 +156,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o
 
 CCL_NAMESPACE_END
 
+#endif  /* __KERNEL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 9bee5603474..bd0e23b7705 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -17,11 +17,11 @@
 #ifndef __KERNEL_MATH_H__
 #define __KERNEL_MATH_H__
 
-#include "util_color.h"
-#include "util_math.h"
-#include "util_math_fast.h"
-#include "util_texture.h"
-#include "util_transform.h"
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_math_intersect.h"
+#include "util/util_texture.h"
+#include "util/util_transform.h"
 
 #endif /* __KERNEL_MATH_H__ */
-
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 7aec47e4957..ed523696571 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -19,16 +19,16 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
 {
 	ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
 	atomic_add_and_fetch_float(buf, value);
 #else
 	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif  /* __SPLIT_KERNEL__ */
 }
 
 ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
 {
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -39,12 +39,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa
 #else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif  /* __SPLIT_KERNEL__ */
 }
 
 ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
 {
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -57,7 +57,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
 #else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif  /* __SPLIT_KERNEL__ */
 }
 
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
@@ -75,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		return;
 	
 	if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-		if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
+		if(!(sd->flag & SD_TRANSPARENT) ||
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
 		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
 		{
 
 			if(sample == 0) {
 				if(flag & PASS_DEPTH) {
-					float depth = camera_distance(kg, ccl_fetch(sd, P));
+					float depth = camera_distance(kg, sd->P);
 					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
 				}
 				if(flag & PASS_OBJECT_ID) {
-					float id = object_pass_id(kg, ccl_fetch(sd, object));
+					float id = object_pass_id(kg, sd->object);
 					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
 				}
 				if(flag & PASS_MATERIAL_ID) {
@@ -96,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 			}
 
 			if(flag & PASS_NORMAL) {
-				float3 normal = ccl_fetch(sd, N);
+				float3 normal = sd->N;
 				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
 			}
 			if(flag & PASS_UV) {
@@ -127,7 +127,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		float mist_start = kernel_data.film.mist_start;
 		float mist_inv_depth = kernel_data.film.mist_inv_depth;
 
-		float depth = camera_distance(kg, ccl_fetch(sd, P));
+		float depth = camera_distance(kg, sd->P);
 		float mist = saturate((depth - mist_start)*mist_inv_depth);
 
 		/* falloff */
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index f90701a8260..e7957042182 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -15,40 +15,41 @@
  */
 
 #ifdef __OSL__
-#  include "osl_shader.h"
+#  include "kernel/osl/osl_shader.h"
 #endif
 
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_camera.h"
 
-#include "geom/geom.h"
-#include "bvh/bvh.h"
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
 
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
 
 #ifdef __SUBSURFACE__
-#  include "kernel_subsurface.h"
+#  include "kernel/kernel_subsurface.h"
 #endif
 
 #ifdef __VOLUME__
-#  include "kernel_volume.h"
+#  include "kernel/kernel_volume.h"
 #endif
 
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shadow.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_path_common.h"
+#include "kernel/kernel_path_surface.h"
+#include "kernel/kernel_path_volume.h"
+#include "kernel/kernel_path_subsurface.h"
 
 #ifdef __KERNEL_DEBUG__
-#  include "kernel_debug.h"
+#  include "kernel/kernel_debug.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
@@ -75,22 +76,25 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 
 	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+	if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 		Ray light_ray;
 		float3 ao_shadow;
 
-		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+		light_ray.P = ray_offset(sd->P, sd->Ng);
 		light_ray.D = ao_D;
 		light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-		light_ray.time = ccl_fetch(sd, time);
+		light_ray.time = sd->time;
 #endif  /* __OBJECT_MOTION__ */
-		light_ray.dP = ccl_fetch(sd, dP);
+		light_ray.dP = sd->dP;
 		light_ray.dD = differential3_zero();
 
 		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
 			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
 		}
+		else {
+			path_radiance_accum_total_ao(L, throughput, ao_bsdf);
+		}
 	}
 }
 
@@ -289,9 +293,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, emission_sd, state, ray);
 			path_radiance_accum_background(L,
+			                               state,
 			                               throughput,
-			                               L_background,
-			                               state->bounce);
+			                               L_background);
 #endif  /* __BACKGROUND__ */
 
 			break;
@@ -311,6 +315,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		shader_merge_closures(sd);
 #endif  /* __BRANCHED_PATH__ */
 
+#ifdef __SHADOW_TRICKS__
+		if(!(sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+			state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
+		}
+#endif  /* __SHADOW_TRICKS__ */
+
 		/* blurring of bsdf after bounces, for rays that have a small likelihood
 		 * of following this particular path (diffuse, rough glossy) */
 		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
@@ -373,7 +383,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 
 			/* do bssrdf scatter step if we picked a bssrdf closure */
 			if(sc) {
-				uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+				uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
 
 				float bssrdf_u, bssrdf_v;
 				path_state_rng_2D(kg,
@@ -395,7 +405,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 
 #if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
 		if(kernel_data.integrator.use_direct_light) {
-			int all = kernel_data.integrator.sample_all_lights_indirect;
+			int all = (kernel_data.integrator.sample_all_lights_indirect) ||
+			          (state->flag & PATH_RAY_SHADOW_CATCHER);
 			kernel_branched_path_surface_connect_light(kg,
 			                                           rng,
 			                                           sd,
@@ -413,172 +424,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 	}
 }
 
-#ifdef __SUBSURFACE__
-#  ifndef __KERNEL_CUDA__
-ccl_device
-#  else
-ccl_device_inline
-#  endif
-bool kernel_path_subsurface_scatter(
-        KernelGlobals *kg,
-        ShaderData *sd,
-        ShaderData *emission_sd,
-        PathRadiance *L,
-        PathState *state,
-        RNG *rng,
-        Ray *ray,
-        float3 *throughput,
-        SubsurfaceIndirectRays *ss_indirect)
-{
-	float bssrdf_probability;
-	ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
-
-	/* modify throughput for picking bssrdf or bsdf */
-	*throughput *= bssrdf_probability;
-
-	/* do bssrdf scatter step if we picked a bssrdf closure */
-	if(sc) {
-		/* We should never have two consecutive BSSRDF bounces,
-		 * the second one should be converted to a diffuse BSDF to
-		 * avoid this.
-		 */
-		kernel_assert(!ss_indirect->tracing);
-
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-
-		SubsurfaceIntersection ss_isect;
-		float bssrdf_u, bssrdf_v;
-		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-		int num_hits = subsurface_scatter_multi_intersect(kg,
-		                                                  &ss_isect,
-		                                                  sd,
-		                                                  sc,
-		                                                  &lcg_state,
-		                                                  bssrdf_u, bssrdf_v,
-		                                                  false);
-#  ifdef __VOLUME__
-		ss_indirect->need_update_volume_stack =
-		        kernel_data.integrator.use_volumes &&
-		        ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME;
-#  endif  /* __VOLUME__ */
-
-		/* compute lighting with the BSDF closure */
-		for(int hit = 0; hit < num_hits; hit++) {
-			/* NOTE: We reuse the existing ShaderData, we assume the path
-			 * integration loop stops when this function returns true.
-			 */
-			subsurface_scatter_multi_setup(kg,
-			                               &ss_isect,
-			                               hit,
-			                               sd,
-			                               state,
-			                               state->flag,
-			                               sc,
-			                               false);
-
-			PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
-			Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
-			float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
-			PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays];
-
-			*hit_state = *state;
-			*hit_ray = *ray;
-			*hit_tp = *throughput;
-
-			hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
-			path_radiance_init(hit_L, kernel_data.film.use_light_pass);
-			hit_L->direct_throughput = L->direct_throughput;
-			path_radiance_copy_indirect(hit_L, L);
-
-			kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);
-
-			if(kernel_path_surface_bounce(kg,
-			                              rng,
-			                              sd,
-			                              hit_tp,
-			                              hit_state,
-			                              hit_L,
-			                              hit_ray))
-			{
-#  ifdef __LAMP_MIS__
-				hit_state->ray_t = 0.0f;
-#  endif  /* __LAMP_MIS__ */
-
-#  ifdef __VOLUME__
-				if(ss_indirect->need_update_volume_stack) {
-					Ray volume_ray = *ray;
-					/* Setup ray from previous surface point to the new one. */
-					volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
-					                             &volume_ray.t);
-
-					kernel_volume_stack_update_for_subsurface(
-					    kg,
-					    emission_sd,
-					    &volume_ray,
-					    hit_state->volume_stack);
-				}
-#  endif  /* __VOLUME__ */
-				path_radiance_reset_indirect(L);
-				ss_indirect->num_rays++;
-			}
-			else {
-				path_radiance_accum_sample(L, hit_L, 1);
-			}
-		}
-		return true;
-	}
-	return false;
-}
-
-ccl_device_inline void kernel_path_subsurface_init_indirect(
-        SubsurfaceIndirectRays *ss_indirect)
-{
-	ss_indirect->tracing = false;
-	ss_indirect->num_rays = 0;
-}
-
-ccl_device void kernel_path_subsurface_accum_indirect(
-        SubsurfaceIndirectRays *ss_indirect,
-        PathRadiance *L)
-{
-	if(ss_indirect->tracing) {
-		path_radiance_sum_indirect(L);
-		path_radiance_accum_sample(&ss_indirect->direct_L, L, 1);
-		if(ss_indirect->num_rays == 0) {
-			*L = ss_indirect->direct_L;
-		}
-	}
-}
-
-ccl_device void kernel_path_subsurface_setup_indirect(
-        KernelGlobals *kg,
-        SubsurfaceIndirectRays *ss_indirect,
-        PathState *state,
-        Ray *ray,
-        PathRadiance *L,
-        float3 *throughput)
-{
-	if(!ss_indirect->tracing) {
-		ss_indirect->direct_L = *L;
-	}
-	ss_indirect->tracing = true;
-
-	/* Setup state, ray and throughput for indirect SSS rays. */
-	ss_indirect->num_rays--;
-
-	Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
-	PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays];
-
-	*state = ss_indirect->state[ss_indirect->num_rays];
-	*ray = *indirect_ray;
-	*L = *indirect_L;
-	*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
-
-	state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
-}
-
-#endif  /* __SUBSURFACE__ */
 
 ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
                                                RNG *rng,
@@ -631,7 +476,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			}
 
 			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
+			lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d);
 		}
 
 		if(state.bounce > kernel_data.integrator.ao_bounces) {
@@ -776,7 +621,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #ifdef __BACKGROUND__
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
+			path_radiance_accum_background(&L, &state, throughput, L_background);
 #endif  /* __BACKGROUND__ */
 
 			break;
@@ -790,6 +635,21 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
 		shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
 
+#ifdef __SHADOW_TRICKS__
+		if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+			if(state.flag & PATH_RAY_CAMERA) {
+				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+				state.catcher_object = sd.object;
+				if(!kernel_data.background.transparent) {
+					L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+				}
+			}
+		}
+		else {
+			state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
+		}
+#endif  /* __SHADOW_TRICKS__ */
+
 		/* holdout */
 #ifdef __HOLDOUT__
 		if(((sd.flag & SD_HOLDOUT) ||
@@ -907,7 +767,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 	}
 #endif  /* __SUBSURFACE__ */
 
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
+	float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+	if(state.flag & PATH_RAY_SHADOW_CATCHER) {
+		L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
+	}
+	else
+#endif  /* __SHADOW_TRICKS__ */
+	{
+		L_sum = path_radiance_clamp_and_sum(kg, &L);
+	}
 
 	kernel_write_light_passes(kg, buffer, &L, sample);
 
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index ff2b828795d..36fd6c95fe7 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -42,21 +42,25 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 
 		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-		if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 			Ray light_ray;
 			float3 ao_shadow;
 
-			light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+			light_ray.P = ray_offset(sd->P, sd->Ng);
 			light_ray.D = ao_D;
 			light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-			light_ray.time = ccl_fetch(sd, time);
+			light_ray.time = sd->time;
 #endif  /* __OBJECT_MOTION__ */
-			light_ray.dP = ccl_fetch(sd, dP);
+			light_ray.dP = sd->dP;
 			light_ray.dD = differential3_zero();
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
+			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
 				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+			}
+			else {
+				path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
+			}
 		}
 	}
 }
@@ -67,8 +71,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 	RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
 	float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
 		if(!CLOSURE_IS_BSDF(sc->type))
 			continue;
@@ -140,14 +144,14 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                                         Ray *ray,
                                                         float3 throughput)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(!CLOSURE_IS_BSSRDF(sc->type))
 			continue;
 
 		/* set up random number generator */
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+		uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
 		int num_samples = kernel_data.integrator.subsurface_samples;
 		float num_samples_inv = 1.0f/num_samples;
 		RNG bssrdf_rng = cmj_hash(*rng, i);
@@ -169,7 +173,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 			Ray volume_ray = *ray;
 			bool need_update_volume_stack =
 			        kernel_data.integrator.use_volumes &&
-			        ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME;
+			        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
 #endif  /* __VOLUME__ */
 
 			/* compute lighting with the BSDF closure */
@@ -206,7 +210,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 #ifdef __EMISSION__
 				/* direct light */
 				if(kernel_data.integrator.use_direct_light) {
-					int all = kernel_data.integrator.sample_all_lights_direct;
+					int all = (kernel_data.integrator.sample_all_lights_direct) ||
+					          (state->flag & PATH_RAY_SHADOW_CATCHER);
 					kernel_branched_path_surface_connect_light(
 					        kg,
 					        rng,
@@ -280,7 +285,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			}
 
 			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
+			lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d);
 		}
 
 		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
@@ -461,7 +466,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __BACKGROUND__
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
+			path_radiance_accum_background(&L, &state, throughput, L_background);
 #endif  /* __BACKGROUND__ */
 
 			break;
@@ -472,6 +477,21 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
 		shader_merge_closures(&sd);
 
+#ifdef __SHADOW_TRICKS__
+		if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+			if(state.flag & PATH_RAY_CAMERA) {
+				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+				state.catcher_object = sd.object;
+				if(!kernel_data.background.transparent) {
+					L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+				}
+			}
+		}
+		else {
+			state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
+		}
+#endif  /* __SHADOW_TRICKS__ */
+
 		/* holdout */
 #ifdef __HOLDOUT__
 		if((sd.flag & SD_HOLDOUT) || (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) {
@@ -544,7 +564,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __EMISSION__
 			/* direct light */
 			if(kernel_data.integrator.use_direct_light) {
-				int all = kernel_data.integrator.sample_all_lights_direct;
+				int all = (kernel_data.integrator.sample_all_lights_direct) ||
+				          (state.flag & PATH_RAY_SHADOW_CATCHER);
 				kernel_branched_path_surface_connect_light(kg, rng,
 					&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
 			}
@@ -581,7 +602,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #endif  /* __VOLUME__ */
 	}
 
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
+	float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+	if(state.flag & PATH_RAY_SHADOW_CATCHER) {
+		L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
+	}
+	else
+#endif  /* __SHADOW_TRICKS__ */
+	{
+		L_sum = path_radiance_clamp_and_sum(kg, &L);
+	}
 
 	kernel_write_light_passes(kg, buffer, &L, sample);
 
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
index 7b903556bf9..82f83deb595 100644
--- a/intern/cycles/kernel/kernel_path_common.h
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "util_hash.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
                                                ccl_global uint *rng_state,
                                                int sample,
                                                int x, int y,
-                                               ccl_addr_space RNG *rng,
+                                               RNG *rng,
                                                ccl_addr_space Ray *ray)
 {
 	float filter_u;
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 661dc52fb31..c0cd2a63120 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void path_state_init(KernelGlobals *kg,
                                        ShaderData *stack_sd,
                                        ccl_addr_space PathState *state,
-                                       ccl_addr_space RNG *rng,
+                                       RNG *rng,
                                        int sample,
                                        ccl_addr_space Ray *ray)
 {
@@ -54,6 +54,10 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
 		state->volume_stack[0].shader = SHADER_NONE;
 	}
 #endif
+
+#ifdef __SHADOW_TRICKS__
+	state->catcher_object = OBJECT_NONE;
+#endif
 }
 
 ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label)
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
new file mode 100644
index 00000000000..10b568ac3dd
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_subsurface.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __SUBSURFACE__
+#  ifndef __KERNEL_CUDA__
+ccl_device
+#  else
+ccl_device_inline
+#  endif  /* __KERNEL_CUDA__ */
+bool kernel_path_subsurface_scatter(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        RNG *rng,
+        ccl_addr_space Ray *ray,
+        ccl_addr_space float3 *throughput,
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
+{
+	float bssrdf_probability;
+	ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+
+	/* Modify throughput for picking bssrdf or bsdf; this is applied whether or not a closure was picked. */
+	*throughput *= bssrdf_probability;
+
+	/* Do the bssrdf scatter step if we picked a bssrdf closure: every hit gets direct light and, if it bounces, a stored indirect ray. */
+	if(sc) {
+		/* We should never have two consecutive BSSRDF bounces,
+		 * the second one should be converted to a diffuse BSDF to
+		 * avoid this.
+		 */
+		kernel_assert(!ss_indirect->tracing);
+
+		uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
+
+		SubsurfaceIntersection ss_isect;
+		float bssrdf_u, bssrdf_v;
+		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+		int num_hits = subsurface_scatter_multi_intersect(kg,
+		                                                  &ss_isect,
+		                                                  sd,
+		                                                  sc,
+		                                                  &lcg_state,
+		                                                  bssrdf_u, bssrdf_v,
+		                                                  false);
+#  ifdef __VOLUME__
+		ss_indirect->need_update_volume_stack =
+		        kernel_data.integrator.use_volumes &&
+		        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#  endif  /* __VOLUME__ */
+
+		/* Compute lighting with the BSDF closure, once per subsurface hit. */
+		for(int hit = 0; hit < num_hits; hit++) {
+			/* NOTE: We reuse the existing ShaderData, we assume the path
+			 * integration loop stops when this function returns true.
+			 */
+			subsurface_scatter_multi_setup(kg,
+			                               &ss_isect,
+			                               hit,
+			                               sd,
+			                               state,
+			                               state->flag,
+			                               sc,
+			                               false);
+
+			ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];  /* slot at index num_rays; only kept if the bounce below succeeds */
+			ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
+			ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
+			PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays];
+
+			*hit_state = *state;
+			*hit_ray = *ray;
+			*hit_tp = *throughput;
+
+			hit_state->rng_offset += PRNG_BOUNCE_NUM;  /* only the stored copy is advanced; the caller's state is untouched */
+
+			path_radiance_init(hit_L, kernel_data.film.use_light_pass);
+			hit_L->direct_throughput = L->direct_throughput;
+			path_radiance_copy_indirect(hit_L, L);
+
+			kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);  /* direct light goes into hit_L, using the caller's state (not hit_state) */
+
+			if(kernel_path_surface_bounce(kg,
+			                              rng,
+			                              sd,
+			                              hit_tp,
+			                              hit_state,
+			                              hit_L,
+			                              hit_ray))
+			{
+#  ifdef __LAMP_MIS__
+				hit_state->ray_t = 0.0f;
+#  endif  /* __LAMP_MIS__ */
+
+#  ifdef __VOLUME__
+				if(ss_indirect->need_update_volume_stack) {
+					Ray volume_ray = *ray;
+					/* Setup ray from previous surface point to the new one. */
+					volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
+					                             &volume_ray.t);
+
+					kernel_volume_stack_update_for_subsurface(
+					    kg,
+					    emission_sd,
+					    &volume_ray,
+					    hit_state->volume_stack);
+				}
+#  endif  /* __VOLUME__ */
+				path_radiance_reset_indirect(L);
+				ss_indirect->num_rays++;  /* keep this slot; kernel_path_subsurface_setup_indirect() consumes it */
+			}
+			else {
+				path_radiance_accum_sample(L, hit_L, 1);  /* no bounce: num_rays is not advanced, so the next hit reuses this slot */
+			}
+		}
+		return true;  /* a bssrdf closure was picked; sd has been overwritten (see NOTE above) */
+	}
+	return false;  /* no bssrdf closure picked; only *throughput was modified */
+}
+
+ccl_device_inline void kernel_path_subsurface_init_indirect(
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
+{
+	ss_indirect->tracing = false;  /* reset: no indirect SSS ray is being traced */
+	ss_indirect->num_rays = 0;  /* reset: no indirect SSS rays are stored */
+}
+
+ccl_device void kernel_path_subsurface_accum_indirect(
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
+        PathRadiance *L)
+{
+	if(ss_indirect->tracing) {  /* does nothing unless tracing was switched on (see kernel_path_subsurface_setup_indirect) */
+		path_radiance_sum_indirect(L);
+		path_radiance_accum_sample(&ss_indirect->direct_L, L, 1);  /* presumably adds L into direct_L - confirm against path_radiance_accum_sample */
+		if(ss_indirect->num_rays == 0) {
+			*L = ss_indirect->direct_L;  /* no stored rays remain: return the accumulated result through L */
+		}
+	}
+}
+
+ccl_device void kernel_path_subsurface_setup_indirect(
+        KernelGlobals *kg,
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
+        ccl_addr_space PathState *state,
+        ccl_addr_space Ray *ray,
+        PathRadiance *L,
+        ccl_addr_space float3 *throughput)
+{
+	if(!ss_indirect->tracing) {
+		ss_indirect->direct_L = *L;  /* first call: save the caller's L before *L is overwritten below */
+	}
+	ss_indirect->tracing = true;
+
+	/* Setup state, ray and throughput for indirect SSS rays: take the most recently stored entry. */
+	ss_indirect->num_rays--;  /* NOTE(review): num_rays > 0 on entry is not checked here - confirm callers guarantee it */
+
+	ccl_addr_space Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
+	PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays];
+
+	*state = ss_indirect->state[ss_indirect->num_rays];
+	*ray = *indirect_ray;
+	*L = *indirect_L;
+	*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
+
+	state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;  /* uses the already-decremented num_rays, i.e. the index of the entry just taken */
+}
+
+#endif  /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index fea503d06e5..076c82f3853 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -16,16 +16,22 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
-
+#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__)
 /* branched path tracing: connect path directly to position on one or more lights and add it to L */
-ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, ShaderData *emission_sd, PathState *state, float3 throughput,
-	float num_samples_adjust, PathRadiance *L, int sample_all_lights)
+ccl_device_noinline void kernel_branched_path_surface_connect_light(
+        KernelGlobals *kg,
+        RNG *rng,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        ccl_addr_space PathState *state,
+        float3 throughput,
+        float num_samples_adjust,
+        PathRadiance *L,
+        int sample_all_lights)
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
+	if(!(sd->flag & SD_BSDF_HAS_EVAL))
 		return;
 
 	Ray light_ray;
@@ -33,7 +39,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 	bool is_lamp;
 
 #  ifdef __OBJECT_MOTION__
-	light_ray.time = ccl_fetch(sd, time);
+	light_ray.time = sd->time;
 #  endif
 
 	if(sample_all_lights) {
@@ -52,7 +58,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 				float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples);
 
 				LightSample ls;
-				if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) {
+				if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
 					/* The sampling probability returned by lamp_light_sample assumes that all lights were sampled.
 					 * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */
 					if(kernel_data.integrator.pdf_triangles != 0.0f)
@@ -66,6 +72,9 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 							/* accumulate */
 							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 						}
+						else {
+							path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+						}
 					}
 				}
 			}
@@ -87,7 +96,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+				if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 					/* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. */
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;
@@ -100,6 +109,9 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 							/* accumulate */
 							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 						}
+						else {
+							path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+						}
 					}
 				}
 			}
@@ -113,7 +125,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 		float terminate = path_state_rng_light_termination(kg, rng, state);
 
 		LightSample ls;
-		if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+		if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 			/* sample random light */
 			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
@@ -123,6 +135,9 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 					/* accumulate */
 					path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
 				}
+				else {
+					path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
+				}
 			}
 		}
 	}
@@ -130,9 +145,17 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 }
 
 /* branched path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples,
-	float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+ccl_device bool kernel_branched_path_surface_bounce(
+        KernelGlobals *kg,
+        RNG *rng,
+        ShaderData *sd,
+        const ShaderClosure *sc,
+        int sample,
+        int num_samples,
+        ccl_addr_space float3 *throughput,
+        ccl_addr_space PathState *state,
+        PathRadiance *L,
+        Ray *ray)
 {
 	/* sample BSDF */
 	float bsdf_pdf;
@@ -156,15 +179,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 	path_state_next(kg, state, label);
 
 	/* setup ray */
-	ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+	ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
 	ray->D = normalize(bsdf_omega_in);
 	ray->t = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-	ray->dP = ccl_fetch(sd, dP);
+	ray->dP = sd->dP;
 	ray->dD = bsdf_domega_in;
 #endif
 #ifdef __OBJECT_MOTION__
-	ray->time = ccl_fetch(sd, time);
+	ray->time = sd->time;
 #endif
 
 #ifdef __VOLUME__
@@ -188,15 +211,29 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 
 #endif
 
-#ifndef __SPLIT_KERNEL__
 /* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state,
 	PathRadiance *L)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
+	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+		return;
+
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		kernel_branched_path_surface_connect_light(kg,
+		                                           rng,
+		                                           sd,
+		                                           emission_sd,
+		                                           state,
+		                                           throughput,
+		                                           1.0f,
+		                                           L,
+		                                           1);
 		return;
+	}
+#endif
 
 	/* sample illumination from lights to find path contribution */
 	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
@@ -208,11 +245,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = ccl_fetch(sd, time);
+	light_ray.time = sd->time;
 #endif
 
 	LightSample ls;
-	if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+	if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 		float terminate = path_state_rng_light_termination(kg, rng, state);
 		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
@@ -222,15 +259,17 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
 				/* accumulate */
 				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
 			}
+			else {
+				path_radiance_accum_total_light(L, throughput, &L_light);
+			}
 		}
 	}
 #endif
 }
-#endif
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
 ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
-                                           ccl_addr_space RNG *rng,
+                                           RNG *rng,
                                            ShaderData *sd,
                                            ccl_addr_space float3 *throughput,
                                            ccl_addr_space PathState *state,
@@ -238,7 +277,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
                                            ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
-	if(ccl_fetch(sd, flag) & SD_BSDF) {
+	if(sd->flag & SD_BSDF) {
 		/* sample BSDF */
 		float bsdf_pdf;
 		BsdfEval bsdf_eval;
@@ -270,16 +309,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 		path_state_next(kg, state, label);
 
 		/* setup ray */
-		ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
 		ray->D = normalize(bsdf_omega_in);
 
 		if(state->bounce == 0)
-			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+			ray->t -= sd->ray_length; /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 		ray->dD = bsdf_domega_in;
 #endif
 
@@ -291,21 +330,21 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 		return true;
 	}
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
+	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
 		/* no surface shader but have a volume shader? act transparent */
 
 		/* update path state, count as transparent */
 		path_state_next(kg, state, LABEL_TRANSPARENT);
 
 		if(state->bounce == 0)
-			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+			ray->t -= sd->ray_length; /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 		/* setup ray position, direction stays unchanged */
-		ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
+		ray->P = ray_offset(sd->P, -sd->Ng);
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 #endif
 
 		/* enter/exit volume */
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 3d3b7385d8b..371f2c1c7cb 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -24,7 +24,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
         ShaderData *sd,
         ShaderData *emission_sd,
         float3 throughput,
-        PathState *state,
+        ccl_addr_space PathState *state,
         PathRadiance *L)
 {
 #ifdef __EMISSION__
@@ -59,7 +59,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
 			}
 		}
 	}
-#endif
+#endif /* __EMISSION__ */
 }
 
 #ifdef __KERNEL_GPU__
@@ -67,8 +67,14 @@ ccl_device_noinline
 #else
 ccl_device
 #endif
-bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+bool kernel_path_volume_bounce(
+    KernelGlobals *kg,
+    RNG *rng,
+    ShaderData *sd,
+    ccl_addr_space float3 *throughput,
+    ccl_addr_space PathState *state,
+    PathRadiance *L,
+    ccl_addr_space Ray *ray)
 {
 	/* sample phase function */
 	float phase_pdf;
@@ -111,9 +117,18 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	return true;
 }
 
-ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L,
-	bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
+#ifndef __SPLIT_KERNEL__
+ccl_device void kernel_branched_path_volume_connect_light(
+        KernelGlobals *kg,
+        RNG *rng,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        float3 throughput,
+        ccl_addr_space PathState *state,
+        PathRadiance *L,
+        bool sample_all_lights,
+        Ray *ray,
+        const VolumeSegment *segment)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
@@ -261,10 +276,11 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 			}
 		}
 	}
-#endif
+#endif /* __EMISSION__ */
 }
+#endif /* __SPLIT_KERNEL__ */
 
-#endif
+#endif /* __VOLUME_SCATTER__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index cf5614b8a86..96bc636d5ac 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -17,12 +17,15 @@
 #ifndef __KERNEL_QUEUE_H__
 #define __KERNEL_QUEUE_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Queue utility functions for split kernel
  */
-
+#ifdef __KERNEL_OPENCL__
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#endif
 
 /*
  * Enqueue ray index into the queue
@@ -35,7 +38,8 @@ ccl_device void enqueue_ray_index(
         ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
 {
 	/* This thread's queue index. */
-	int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size);
+	int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number])
+	                   + (queue_number * queue_size);
 	queues[my_queue_index] = ray_index;
 }
 
@@ -47,6 +51,7 @@ ccl_device void enqueue_ray_index(
  * is no more ray to allocate to other threads.
  */
 ccl_device int get_ray_index(
+        KernelGlobals *kg,
         int thread_index,       /* Global thread index. */
         int queue_number,       /* Queue to operate on. */
         ccl_global int *queues, /* Buffer of all queues. */
@@ -68,24 +73,25 @@ ccl_device void enqueue_ray_index_local(
         int queue_number,                            /* Queue in which to enqueue ray index. */
         char enqueue_flag,                           /* True for threads whose ray index has to be enqueued. */
         int queuesize,                               /* queue size. */
-        ccl_local unsigned int *local_queue_atomics,   /* To to local queue atomics. */
+        ccl_local_param unsigned int *local_queue_atomics,   /* To to local queue atomics. */
         ccl_global int *Queue_data,                  /* Queues. */
         ccl_global int *Queue_index)                 /* To do global queue atomics. */
 {
-	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
 
 	/* Get local queue id .*/
 	unsigned int lqidx;
 	if(enqueue_flag) {
-		lqidx = atomic_inc(local_queue_atomics);
+		lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics);
 	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
 	/* Get global queue offset. */
 	if(lidx == 0) {
-		*local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+		*local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number],
+		                                                   *local_queue_atomics);
 	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
 	/* Get global queue index and enqueue ray. */
 	if(enqueue_flag) {
@@ -96,19 +102,19 @@ ccl_device void enqueue_ray_index_local(
 
 ccl_device unsigned int get_local_queue_index(
         int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
-        ccl_local unsigned int *local_queue_atomics)
+        ccl_local_param unsigned int *local_queue_atomics)
 {
-	int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]);
+	int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]);
 	return my_lqidx;
 }
 
 ccl_device unsigned int get_global_per_queue_offset(
         int queue_number,
-        ccl_local unsigned int *local_queue_atomics,
+        ccl_local_param unsigned int *local_queue_atomics,
         ccl_global int* global_queue_atomics)
 {
-	unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number],
-	                                       local_queue_atomics[queue_number]);
+	unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number],
+	                                                        local_queue_atomics[queue_number]);
 	return queue_offset;
 }
 
@@ -116,10 +122,12 @@ ccl_device unsigned int get_global_queue_index(
     int queue_number,
     int queuesize,
     unsigned int lqidx,
-    ccl_local unsigned int * global_per_queue_offset)
+    ccl_local_param unsigned int * global_per_queue_offset)
 {
 	int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
 	return my_gqidx;
 }
 
+CCL_NAMESPACE_END
+
 #endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index e773753396f..d4f0caff5de 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "kernel_jitter.h"
+#include "kernel/kernel_jitter.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
 	return index;
 }
 
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -130,7 +130,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *
 #endif
 }
 
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -147,7 +147,7 @@ ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *r
 	}
 }
 
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
 {
 #ifdef __SOBOL_FULL_SCREEN__
 	uint px, py;
@@ -191,14 +191,14 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG
 
 /* Linear Congruential Generator */
 
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
 {
 	/* implicit mod 2^32 */
-	rng = (1103515245*(rng) + 12345);
-	return (float)rng * (1.0f/(float)0xFFFFFFFF);
+	*rng = (1103515245*(*rng) + 12345);
+	return (float)*rng * (1.0f/(float)0xFFFFFFFF);
 }
 
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 	*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
 	*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
@@ -259,12 +259,12 @@ ccl_device uint lcg_init(uint seed)
  * For branches in the path we must be careful not to reuse the same number
  * in a sequence and offset accordingly. */
 
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	/* the rng_offset is not increased for transparent bounces. if we do then
 	 * fully transparent objects can become subtly visible by the different
@@ -277,29 +277,29 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_ad
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
 }
 
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension)
 {
 	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
 
 /* Utitility functions to get light termination value, since it might not be needed in many cases. */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state)
+ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
 		return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
@@ -307,7 +307,7 @@ ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_
 	return 0.0f;
 }
 
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches)
+ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
 		return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
@@ -315,7 +315,7 @@ ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, c
 	return 0.0f;
 }
 
-ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches)
+ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int branch, int num_branches)
 {
 	/* path is splitting into a branch, adjust so that each branch
 	 * still gets a unique sample from the same sequence */
@@ -324,18 +324,9 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b
 	state->num_samples = state->num_samples*num_branches;
 }
 
-ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble)
-{
-	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
-}
-
-/* TODO(sergey): For until we can use generic address space from OpenCL 2.0. */
-
-ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space RNG *rng,
-                                                const ccl_addr_space PathState *state,
-                                                uint scramble)
+ccl_device_inline uint lcg_state_init(RNG *rng, int rng_offset, int sample, uint scramble)
 {
-	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
+	return lcg_init(*rng + rng_offset + sample*scramble);
 }
 
 ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index d0826e5e879..8c0c5e90a3e 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -24,12 +24,12 @@
  *
  */
 
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf.h"
-#include "closure/emissive.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/emissive.h"
 
-#include "svm/svm.h"
+#include "kernel/svm/svm.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN
 #ifdef __OBJECT_MOTION__
 ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
-	if(ccl_fetch(sd, object_flag) & SD_OBJECT_MOTION) {
-		ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time);
-		ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm));
+	if(sd->object_flag & SD_OBJECT_MOTION) {
+		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
+		sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
 	}
 	else {
-		ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
-		ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+		sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+		sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	}
 }
 #endif
@@ -55,55 +55,55 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
                                                const Ray *ray)
 {
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
 
-	ccl_fetch(sd, type) = isect->type;
-	ccl_fetch(sd, flag) = 0;
-	ccl_fetch(sd, object_flag) = kernel_tex_fetch(__object_flag,
-	                                              ccl_fetch(sd, object));
+	sd->type = isect->type;
+	sd->flag = 0;
+	sd->object_flag = kernel_tex_fetch(__object_flag,
+	                                              sd->object);
 
 	/* matrices and time */
 #ifdef __OBJECT_MOTION__
 	shader_setup_object_transforms(kg, sd, ray->time);
-	ccl_fetch(sd, time) = ray->time;
+	sd->time = ray->time;
 #endif
 
-	ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
-	ccl_fetch(sd, ray_length) = isect->t;
+	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
+	sd->ray_length = isect->t;
 
 #ifdef __UV__
-	ccl_fetch(sd, u) = isect->u;
-	ccl_fetch(sd, v) = isect->v;
+	sd->u = isect->u;
+	sd->v = isect->v;
 #endif
 
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 		/* curve */
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
 
-		ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
-		ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
+		sd->shader = __float_as_int(curvedata.z);
+		sd->P = bvh_curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
-	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
 		float3 Ng = triangle_normal(kg, sd);
-		ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+		sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* vectors */
-		ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
-		ccl_fetch(sd, Ng) = Ng;
-		ccl_fetch(sd, N) = Ng;
+		sd->P = triangle_refine(kg, sd, isect, ray);
+		sd->Ng = Ng;
+		sd->N = Ng;
 		
 		/* smooth normal */
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
-			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+		if(sd->shader & SHADER_SMOOTH_NORMAL)
+			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
 
 #ifdef __DPDU__
 		/* dPdu/dPdv */
-		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 #endif
 	}
 	else {
@@ -111,40 +111,40 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 		motion_triangle_shader_setup(kg, sd, isect, ray, false);
 	}
 
-	ccl_fetch(sd, I) = -ray->D;
+	sd->I = -ray->D;
 
-	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
+	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
 
 #ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
+		object_normal_transform_auto(kg, sd, &sd->N);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
 #  ifdef __DPDU__
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+		object_dir_transform_auto(kg, sd, &sd->dPdu);
+		object_dir_transform_auto(kg, sd, &sd->dPdv);
 #  endif
 	}
 #endif
 
 	/* backfacing test */
-	bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+	bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
 
 	if(backfacing) {
-		ccl_fetch(sd, flag) |= SD_BACKFACING;
-		ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
-		ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+		sd->flag |= SD_BACKFACING;
+		sd->Ng = -sd->Ng;
+		sd->N = -sd->N;
 #ifdef __DPDU__
-		ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
-		ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+		sd->dPdu = -sd->dPdu;
+		sd->dPdv = -sd->dPdv;
 #endif
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
-	differential_incoming(&ccl_fetch(sd, dI), ray->dD);
-	differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
+	differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
+	differential_incoming(&sd->dI, ray->dD);
+	differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
 #endif
 }
 
@@ -203,11 +203,11 @@ void shader_setup_from_subsurface(
 #  ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform(kg, sd, &sd->N);
-		object_normal_transform(kg, sd, &sd->Ng);
+		object_normal_transform_auto(kg, sd, &sd->N);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
 #    ifdef __DPDU__
-		object_dir_transform(kg, sd, &sd->dPdu);
-		object_dir_transform(kg, sd, &sd->dPdv);
+		object_dir_transform_auto(kg, sd, &sd->dPdu);
+		object_dir_transform_auto(kg, sd, &sd->dPdv);
 #    endif
 	}
 #  endif
@@ -249,106 +249,106 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
                                                 int lamp)
 {
 	/* vectors */
-	ccl_fetch(sd, P) = P;
-	ccl_fetch(sd, N) = Ng;
-	ccl_fetch(sd, Ng) = Ng;
-	ccl_fetch(sd, I) = I;
-	ccl_fetch(sd, shader) = shader;
+	sd->P = P;
+	sd->N = Ng;
+	sd->Ng = Ng;
+	sd->I = I;
+	sd->shader = shader;
 	if(prim != PRIM_NONE)
-		ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE;
+		sd->type = PRIMITIVE_TRIANGLE;
 	else if(lamp != LAMP_NONE)
-		ccl_fetch(sd, type) = PRIMITIVE_LAMP;
+		sd->type = PRIMITIVE_LAMP;
 	else
-		ccl_fetch(sd, type) = PRIMITIVE_NONE;
+		sd->type = PRIMITIVE_NONE;
 
 	/* primitive */
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = object;
+	sd->object = object;
 #endif
 	/* currently no access to bvh prim index for strand sd->prim*/
-	ccl_fetch(sd, prim) = prim;
+	sd->prim = prim;
 #ifdef __UV__
-	ccl_fetch(sd, u) = u;
-	ccl_fetch(sd, v) = v;
+	sd->u = u;
+	sd->v = v;
 #endif
-	ccl_fetch(sd, ray_length) = t;
+	sd->ray_length = t;
 
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
-	ccl_fetch(sd, object_flag) = 0;
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
-		ccl_fetch(sd, object_flag) |= kernel_tex_fetch(__object_flag,
-		                                               ccl_fetch(sd, object));
+	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+	sd->object_flag = 0;
+	if(sd->object != OBJECT_NONE) {
+		sd->object_flag |= kernel_tex_fetch(__object_flag,
+		                                               sd->object);
 
 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
-		ccl_fetch(sd, time) = time;
+		sd->time = time;
 	}
 	else if(lamp != LAMP_NONE) {
-		ccl_fetch(sd, ob_tfm)  = lamp_fetch_transform(kg, lamp, false);
-		ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true);
+		sd->ob_tfm  = lamp_fetch_transform(kg, lamp, false);
+		sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
 #endif
 	}
 
 	/* transform into world space */
 	if(object_space) {
-		object_position_transform_auto(kg, sd, &ccl_fetch(sd, P));
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I));
+		object_position_transform_auto(kg, sd, &sd->P);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
+		sd->N = sd->Ng;
+		object_dir_transform_auto(kg, sd, &sd->I);
 	}
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* smooth normal */
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
-			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+		if(sd->shader & SHADER_SMOOTH_NORMAL) {
+			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
 
 #ifdef __INSTANCING__
-			if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
-				object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+			if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+				object_normal_transform_auto(kg, sd, &sd->N);
 			}
 #endif
 		}
 
 		/* dPdu/dPdv */
 #ifdef __DPDU__
-		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 
 #  ifdef __INSTANCING__
-		if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
-			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
-			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+			object_dir_transform_auto(kg, sd, &sd->dPdu);
+			object_dir_transform_auto(kg, sd, &sd->dPdv);
 		}
 #  endif
 #endif
 	}
 	else {
 #ifdef __DPDU__
-		ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
-		ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 	}
 
 	/* backfacing test */
-	if(ccl_fetch(sd, prim) != PRIM_NONE) {
-		bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+	if(sd->prim != PRIM_NONE) {
+		bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
 
 		if(backfacing) {
-			ccl_fetch(sd, flag) |= SD_BACKFACING;
-			ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
-			ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+			sd->flag |= SD_BACKFACING;
+			sd->Ng = -sd->Ng;
+			sd->N = -sd->N;
 #ifdef __DPDU__
-			ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
-			ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+			sd->dPdu = -sd->dPdu;
+			sd->dPdv = -sd->dPdv;
 #endif
 		}
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* no ray differentials here yet */
-	ccl_fetch(sd, dP) = differential3_zero();
-	ccl_fetch(sd, dI) = differential3_zero();
-	ccl_fetch(sd, du) = differential_zero();
-	ccl_fetch(sd, dv) = differential_zero();
+	sd->dP = differential3_zero();
+	sd->dI = differential3_zero();
+	sd->du = differential_zero();
+	sd->dv = differential_zero();
 #endif
 }
 
@@ -378,39 +378,39 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
 {
 	/* vectors */
-	ccl_fetch(sd, P) = ray->D;
-	ccl_fetch(sd, N) = -ray->D;
-	ccl_fetch(sd, Ng) = -ray->D;
-	ccl_fetch(sd, I) = -ray->D;
-	ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
-	ccl_fetch(sd, object_flag) = 0;
+	sd->P = ray->D;
+	sd->N = -ray->D;
+	sd->Ng = -ray->D;
+	sd->I = -ray->D;
+	sd->shader = kernel_data.background.surface_shader;
+	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+	sd->object_flag = 0;
 #ifdef __OBJECT_MOTION__
-	ccl_fetch(sd, time) = ray->time;
+	sd->time = ray->time;
 #endif
-	ccl_fetch(sd, ray_length) = 0.0f;
+	sd->ray_length = 0.0f;
 
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = PRIM_NONE;
+	sd->object = PRIM_NONE;
 #endif
-	ccl_fetch(sd, prim) = PRIM_NONE;
+	sd->prim = PRIM_NONE;
 #ifdef __UV__
-	ccl_fetch(sd, u) = 0.0f;
-	ccl_fetch(sd, v) = 0.0f;
+	sd->u = 0.0f;
+	sd->v = 0.0f;
 #endif
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
-	ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+	sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+	sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	ccl_fetch(sd, dP) = ray->dD;
-	differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
-	ccl_fetch(sd, du) = differential_zero();
-	ccl_fetch(sd, dv) = differential_zero();
+	sd->dP = ray->dD;
+	differential_incoming(&sd->dI, sd->dP);
+	sd->du = differential_zero();
+	sd->dv = differential_zero();
 #endif
 }
 
@@ -505,18 +505,18 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+	for(int i = 0; i < sd->num_closure; i++) {
 		if(i == skip_bsdf)
 			continue;
 
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+		const ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
 			float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
 
 			if(bsdf_pdf != 0.0f) {
-				bsdf_eval_accum(result_eval, sc->type, eval*sc->weight);
+				bsdf_eval_accum(result_eval, sc->type, eval*sc->weight, 1.0f);
 				sum_pdf += bsdf_pdf*sc->sample_weight;
 			}
 
@@ -535,8 +535,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
                                                         float light_pdf,
                                                         bool use_mis)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
 			float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
@@ -544,7 +544,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
 				float mis_weight = use_mis? power_heuristic(light_pdf, bsdf_pdf): 1.0f;
 				bsdf_eval_accum(result_eval,
 				                sc->type,
-				                eval * sc->weight * mis_weight);
+				                eval * sc->weight,
+				                mis_weight);
 			}
 		}
 	}
@@ -576,7 +577,7 @@ void shader_bsdf_eval(KernelGlobals *kg,
 		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f);
 		if(use_mis) {
 			float weight = power_heuristic(light_pdf, pdf);
-			bsdf_eval_mul(eval, weight);
+			bsdf_eval_mis(eval, weight);
 		}
 	}
 }
@@ -591,22 +592,22 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
 {
 	int sampled = 0;
 
-	if(ccl_fetch(sd, num_closure) > 1) {
+	if(sd->num_closure > 1) {
 		/* pick a BSDF closure based on sample weights */
 		float sum = 0.0f;
 
-		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
-			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+		for(sampled = 0; sampled < sd->num_closure; sampled++) {
+			const ShaderClosure *sc = &sd->closure[sampled];
 			
 			if(CLOSURE_IS_BSDF(sc->type))
 				sum += sc->sample_weight;
 		}
 
-		float r = ccl_fetch(sd, randb_closure)*sum;
+		float r = sd->randb_closure*sum;
 		sum = 0.0f;
 
-		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
-			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+		for(sampled = 0; sampled < sd->num_closure; sampled++) {
+			const ShaderClosure *sc = &sd->closure[sampled];
 			
 			if(CLOSURE_IS_BSDF(sc->type)) {
 				sum += sc->sample_weight;
@@ -616,13 +617,13 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
 			}
 		}
 
-		if(sampled == ccl_fetch(sd, num_closure)) {
+		if(sampled == sd->num_closure) {
 			*pdf = 0.0f;
 			return LABEL_NONE;
 		}
 	}
 
-	const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+	const ShaderClosure *sc = &sd->closure[sampled];
 
 	int label;
 	float3 eval;
@@ -633,7 +634,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
 	if(*pdf != 0.0f) {
 		bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);
 
-		if(ccl_fetch(sd, num_closure) > 1) {
+		if(sd->num_closure > 1) {
 			float sweight = sc->sample_weight;
 			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
 		}
@@ -660,8 +661,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF(sc->type))
 			bsdf_blur(kg, sc, roughness);
@@ -670,13 +671,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
 
 ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 {
-	if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
+	if(sd->flag & SD_HAS_ONLY_VOLUME)
 		return make_float3(1.0f, 1.0f, 1.0f);
 
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
 			eval += sc->weight;
@@ -685,6 +686,18 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 	return eval;
 }
 
+ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd)
+{
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+
+		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
+			sc->sample_weight = 0.0f;
+			sc->weight = make_float3(0.0f, 0.0f, 0.0f);
+		}
+	}
+}
+
 ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 alpha = make_float3(1.0f, 1.0f, 1.0f) - shader_bsdf_transparency(kg, sd);
@@ -699,8 +712,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
 			eval += sc->weight;
@@ -713,8 +726,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
 			eval += sc->weight;
@@ -727,8 +740,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
 			eval += sc->weight;
@@ -741,8 +754,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
 			eval += sc->weight;
@@ -756,8 +769,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
 			const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
@@ -766,12 +779,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 		}
 		else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
 			eval += sc->weight;
-			N += ccl_fetch(sd, N)*average(sc->weight);
+			N += sd->N*average(sc->weight);
 		}
 	}
 
 	if(is_zero(N))
-		N = ccl_fetch(sd, N);
+		N = sd->N;
 	else
 		N = normalize(N);
 
@@ -786,8 +799,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 	float texture_blur = 0.0f, weight_sum = 0.0f;
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSSRDF(sc->type)) {
 			const Bssrdf *bssrdf = (const Bssrdf*)sc;
@@ -801,10 +814,10 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	}
 
 	if(N_)
-		*N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);
+		*N_ = (is_zero(N))? sd->N: normalize(N);
 
 	if(texture_blur_)
-		*texture_blur_ = texture_blur/weight_sum;
+		*texture_blur_ = safe_divide(texture_blur, weight_sum);
 	
 	return eval;
 }
@@ -814,7 +827,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 
 ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
 {
-	return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
+	return emissive_simple_eval(sd->Ng, sd->I);
 }
 
 ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
@@ -822,8 +835,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 	float3 eval;
 	eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_EMISSION(sc->type))
 			eval += emissive_eval(kg, sd, sc)*sc->weight;
@@ -838,8 +851,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_HOLDOUT(sc->type))
 			weight += sc->weight;
@@ -850,12 +863,12 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 
 /* Surface Evaluation */
 
-ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng,
+ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng,
 	ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = randb;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
+	sd->randb_closure = randb;
 
 #ifdef __OSL__
 	if(kg->osl)
@@ -869,13 +882,13 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
 		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd,
 		                                             sizeof(DiffuseBsdf),
 		                                             make_float3(0.8f, 0.8f, 0.8f));
-		bsdf->N = ccl_fetch(sd, N);
-		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf);
+		bsdf->N = sd->N;
+		sd->flag |= bsdf_diffuse_setup(bsdf);
 #endif
 	}
 
-	if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) {
-		ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953);
+	if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) {
+		sd->lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0xb4bc3953);
 	}
 }
 
@@ -884,9 +897,9 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
 	ccl_addr_space PathState *state, int path_flag, ShaderContext ctx)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = 0.0f;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
+	sd->randb_closure = 0.0f;
 
 #ifdef __SVM__
 #ifdef __OSL__
@@ -901,8 +914,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
 
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BACKGROUND(sc->type))
 			eval += sc->weight;
@@ -932,7 +945,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
 			float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf);
 
 			if(phase_pdf != 0.0f) {
-				bsdf_eval_accum(result_eval, sc->type, eval);
+				bsdf_eval_accum(result_eval, sc->type, eval, 1.0f);
 				sum_pdf += phase_pdf*sc->sample_weight;
 			}
 
@@ -1024,8 +1037,8 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData *
 
 ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
                                           ShaderData *sd,
-                                          PathState *state,
-                                          VolumeStack *stack,
+                                          ccl_addr_space PathState *state,
+                                          ccl_addr_space VolumeStack *stack,
                                           int path_flag,
                                           ShaderContext ctx)
 {
@@ -1081,9 +1094,9 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 
 ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = 0.0f;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
+	sd->randb_closure = 0.0f;
 
 	/* this will modify sd->P */
 #ifdef __SVM__
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 2981f6ac566..0426e0a62c9 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -16,9 +16,84 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __SHADOW_RECORD_ALL__
+/* Attenuate throughput according to the given intersection event.
+ * Returns true if the throughput is zero and traversal can be aborted.
+ */
+ccl_device_forceinline bool shadow_handle_transparent_isect(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+#    ifdef __VOLUME__
+        ccl_addr_space struct PathState *volume_state,
+#    endif
+        Intersection *isect,
+        Ray *ray,
+        float3 *throughput)
+{
+#ifdef __VOLUME__
+	/* Attenuation between last surface and next surface. */
+	if(volume_state->volume_stack[0].shader != SHADER_NONE) {
+		Ray segment_ray = *ray;
+		segment_ray.t = isect->t;
+		kernel_volume_shadow(kg,
+		                     shadow_sd,
+		                     volume_state,
+		                     &segment_ray,
+		                     throughput);
+	}
+#endif
+	/* Setup shader data at surface. */
+	shader_setup_from_ray(kg, shadow_sd, isect, ray);
+	/* Attenuation from transparent surface. */
+	if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
+		path_state_modify_bounce(state, true);
+		shader_eval_surface(kg,
+		                    shadow_sd,
+		                    NULL,
+		                    state,
+		                    0.0f,
+		                    PATH_RAY_SHADOW,
+		                    SHADER_CONTEXT_SHADOW);
+		path_state_modify_bounce(state, false);
+		*throughput *= shader_bsdf_transparency(kg, shadow_sd);
+	}
+	/* Stop if all light is blocked. */
+	if(is_zero(*throughput)) {
+		return true;
+	}
+#ifdef __VOLUME__
+	/* Exit/enter volume. */
+	kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack);
+#endif
+	return false;
+}
+
+/* Special version which only handles opaque shadows. */
+ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
+                                      ShaderData *shadow_sd,
+                                      ccl_addr_space PathState *state,
+                                      Ray *ray,
+                                      Intersection *isect,
+                                      float3 *shadow)
+{
+	const bool blocked = scene_intersect(kg,
+	                                     *ray,
+	                                     PATH_RAY_SHADOW_OPAQUE,
+	                                     isect,
+	                                     NULL,
+	                                     0.0f, 0.0f);
+#ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* Apply attenuation from current volume shader. */
+		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+	}
+#endif
+	return blocked;
+}
 
-/* Shadow function to compute how much light is blocked, CPU variation.
+#ifdef __TRANSPARENT_SHADOWS__
+#  ifdef __SHADOW_RECORD_ALL__
+/* Shadow function to compute how much light is blocked,
  *
  * We trace a single ray. If it hits any opaque surface, or more than a given
  * number of transparent surfaces is hit, then we consider the geometry to be
@@ -36,261 +111,403 @@ CCL_NAMESPACE_BEGIN
  * or there is a performance increase anyway due to avoiding the need to send
  * two rays with transparent shadows.
  *
- * This is CPU only because of qsort, and malloc or high stack space usage to
- * record all these intersections. */
+ * On CPU it'll handle all transparent bounces (by allocating storage for
+ * intersections when they don't fit into the stack storage).
+ *
+ * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
+ * is something to be kept an eye on.
+ */
 
-#define STACK_MAX_HITS 64
+#    define SHADOW_STACK_MAX_HITS 64
 
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow)
+/* Actual logic with traversal loop implementation which is free from device
+ * specific tweaks.
+ *
+ * Note that hits array should be as big as max_hits+1.
+ */
+ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
+                                                    ShaderData *shadow_sd,
+                                                    ccl_addr_space PathState *state,
+                                                    const int skip_object,
+                                                    Ray *ray,
+                                                    Intersection *hits,
+                                                    uint max_hits,
+                                                    float3 *shadow)
 {
-	*shadow = make_float3(1.0f, 1.0f, 1.0f);
-
-	if(ray->t == 0.0f)
-		return false;
-	
-	bool blocked;
-
-	if(kernel_data.integrator.transparent_shadows) {
-		/* check transparent bounces here, for volume scatter which can do
-		 * lighting before surface path termination is checked */
-		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
-			return true;
-
-		/* intersect to find an opaque surface, or record all transparent surface hits */
-		Intersection hits_stack[STACK_MAX_HITS];
-		Intersection *hits = hits_stack;
-		const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
-		uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
-
-		/* prefer to use stack but use dynamic allocation if too deep max hits
-		 * we need max_hits + 1 storage space due to the logic in
-		 * scene_intersect_shadow_all which will first store and then check if
-		 * the limit is exceeded */
-		if(max_hits + 1 > STACK_MAX_HITS) {
-			if(kg->transparent_shadow_intersections == NULL) {
-				kg->transparent_shadow_intersections =
-				    (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+	/* Intersect to find an opaque surface, or record all transparent
+	 * surface hits.
+	 */
+	uint num_hits;
+	const bool blocked = scene_intersect_shadow_all(kg,
+	                                                ray,
+	                                                hits,
+	                                                skip_object,
+	                                                max_hits,
+	                                                &num_hits);
+	/* If no opaque surface found but we did find transparent hits,
+	 * shade them.
+	 */
+	if(!blocked && num_hits > 0) {
+		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+		float3 Pend = ray->P + ray->D*ray->t;
+		float last_t = 0.0f;
+		int bounce = state->transparent_bounce;
+		Intersection *isect = hits;
+#    ifdef __VOLUME__
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#      else
+		PathState ps_object;
+		PathState *ps = &ps_object;
+#      endif
+		*ps = *state;
+#    endif
+		sort_intersections(hits, num_hits);
+		for(int hit = 0; hit < num_hits; hit++, isect++) {
+			/* Adjust intersection distance for moving ray forward. */
+			float new_t = isect->t;
+			isect->t -= last_t;
+			/* Skip hit if we did not move forward, step by step raytracing
+			 * would have skipped it as well then.
+			 */
+			if(last_t == new_t) {
+				continue;
 			}
-			hits = kg->transparent_shadow_intersections;
-		}
-
-		uint num_hits;
-		blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
-
-		/* if no opaque surface found but we did find transparent hits, shade them */
-		if(!blocked && num_hits > 0) {
-			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-			float3 Pend = ray->P + ray->D*ray->t;
-			float last_t = 0.0f;
-			int bounce = state->transparent_bounce;
-			Intersection *isect = hits;
+			last_t = new_t;
+			/* Attenuate the throughput. */
+			if(shadow_handle_transparent_isect(kg,
+			                                   shadow_sd,
+			                                   state,
 #ifdef __VOLUME__
-			PathState ps = *state;
+			                                   ps,
 #endif
-
-			qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
-			for(int hit = 0; hit < num_hits; hit++, isect++) {
-				/* adjust intersection distance for moving ray forward */
-				float new_t = isect->t;
-				isect->t -= last_t;
-
-				/* skip hit if we did not move forward, step by step raytracing
-				 * would have skipped it as well then */
-				if(last_t == new_t)
-					continue;
-
-				last_t = new_t;
-
-#ifdef __VOLUME__
-				/* attenuation between last surface and next surface */
-				if(ps.volume_stack[0].shader != SHADER_NONE) {
-					Ray segment_ray = *ray;
-					segment_ray.t = isect->t;
-					kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
-				}
-#endif
-
-				/* setup shader data at surface */
-				shader_setup_from_ray(kg, shadow_sd, isect, ray);
-
-				/* attenuation from transparent surface */
-				if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
-					path_state_modify_bounce(state, true);
-					shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					path_state_modify_bounce(state, false);
-
-					throughput *= shader_bsdf_transparency(kg, shadow_sd);
-				}
-
-				/* stop if all light is blocked */
-				if(is_zero(throughput)) {
-					return true;
-				}
-
-				/* move ray forward */
-				ray->P = shadow_sd->P;
-				if(ray->t != FLT_MAX) {
-					ray->D = normalize_len(Pend - ray->P, &ray->t);
-				}
-
-#ifdef __VOLUME__
-				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
-#endif
-
-				bounce++;
+			                                   isect,
+			                                   ray,
+			                                   &throughput))
+			{
+				return true;
 			}
-
-#ifdef __VOLUME__
-			/* attenuation for last line segment towards light */
-			if(ps.volume_stack[0].shader != SHADER_NONE)
-				kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
-#endif
-
-			*shadow = throughput;
-
-			return is_zero(throughput);
+			/* Move ray forward. */
+			ray->P = shadow_sd->P;
+			if(ray->t != FLT_MAX) {
+				ray->D = normalize_len(Pend - ray->P, &ray->t);
+			}
+			bounce++;
 		}
+#    ifdef __VOLUME__
+		/* Attenuation for last line segment towards light. */
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
+		}
+#    endif
+		*shadow = throughput;
+		return is_zero(throughput);
 	}
-	else {
-		Intersection isect;
-		blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
-	}
-
-#ifdef __VOLUME__
+#    ifdef __VOLUME__
 	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* apply attenuation from current volume shader */
+		/* Apply attenuation from current volume shader. */
 		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
 	}
-#endif
-
+#    endif
 	return blocked;
 }
 
-#undef STACK_MAX_HITS
-
-#else
+/* Here we do all device specific trickery before invoking actual traversal
+ * loop to help readability of the actual logic.
+ */
+ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
+                                               ShaderData *shadow_sd,
+                                               ccl_addr_space PathState *state,
+                                               const int skip_object,
+                                               Ray *ray,
+                                               uint max_hits,
+                                               float3 *shadow)
+{
+#    ifdef __SPLIT_KERNEL__
+	Intersection hits_[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = &hits_[0];
+#    elif defined(__KERNEL_CUDA__)
+	Intersection *hits = kg->hits_stack;
+#    else
+	Intersection hits_stack[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = hits_stack;
+#    endif
+#    ifndef __KERNEL_GPU__
+	/* Prefer to use the stack, but use dynamic allocation if max hits is too
+	 * deep. We need max_hits + 1 storage space due to the logic in
+	 * scene_intersect_shadow_all which will first store and then check if
+	 * the limit is exceeded.
+	 *
+	 * Ignore this on GPU because of slow/unavailable malloc().
+	 */
+	if(max_hits + 1 > SHADOW_STACK_MAX_HITS) {
+		if(kg->transparent_shadow_intersections == NULL) {
+			const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+			kg->transparent_shadow_intersections =
+				(Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+		}
+		hits = kg->transparent_shadow_intersections;
+	}
+#    endif  /* __KERNEL_GPU__ */
+	/* Invoke actual traversal. */
+	return shadow_blocked_transparent_all_loop(kg,
+	                                           shadow_sd,
+	                                           state,
+	                                           skip_object,
+	                                           ray,
+	                                           hits,
+	                                           max_hits,
+	                                           shadow);
+}
+#  endif  /* __SHADOW_RECORD_ALL__ */
 
-/* Shadow function to compute how much light is blocked, GPU variation.
+#  if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
+/* Shadow function to compute how much light is blocked,
  *
  * Here we raytrace from one transparent surface to the next step by step.
  * To minimize overhead in cases where we don't need transparent shadows, we
  * first trace a regular shadow ray. We check if the hit primitive was
  * potentially transparent, and only in that case start marching. this gives
- * one extra ray cast for the cases were we do want transparency. */
+ * one extra ray cast for the cases where we do want transparency.
+ */
 
-ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
-                                        ShaderData *shadow_sd,
-                                        ccl_addr_space PathState *state,
-                                        ccl_addr_space Ray *ray_input,
-                                        float3 *shadow)
+/* This function is only implementing device-independent traversal logic
+ * which requires some precalculation done.
+ */
+ccl_device bool shadow_blocked_transparent_stepped_loop(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+        const int skip_object,
+        Ray *ray,
+        Intersection *isect,
+        const bool blocked,
+        const bool is_transparent_isect,
+        float3 *shadow)
 {
-	*shadow = make_float3(1.0f, 1.0f, 1.0f);
-
-	if(ray_input->t == 0.0f)
-		return false;
-
-#ifdef __SPLIT_KERNEL__
-	Ray private_ray = *ray_input;
-	Ray *ray = &private_ray;
-#else
-	Ray *ray = ray_input;
-#endif
-
-#ifdef __SPLIT_KERNEL__
-	Intersection *isect = &kg->isect_shadow[SD_THREAD];
-#else
-	Intersection isect_object;
-	Intersection *isect = &isect_object;
-#endif
-
-	bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
-
-#ifdef __TRANSPARENT_SHADOWS__
-	if(blocked && kernel_data.integrator.transparent_shadows) {
-		if(shader_transparent_shadow(kg, isect)) {
-			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-			float3 Pend = ray->P + ray->D*ray->t;
-			int bounce = state->transparent_bounce;
-#ifdef __VOLUME__
-			PathState ps = *state;
-#endif
-
-			for(;;) {
-				if(bounce >= kernel_data.integrator.transparent_max_bounce)
-					return true;
-
-				if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
-				{
-#ifdef __VOLUME__
-					/* attenuation for last line segment towards light */
-					if(ps.volume_stack[0].shader != SHADER_NONE)
-						kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
-#endif
-
-					*shadow *= throughput;
-
-					return false;
-				}
-
-				if(!shader_transparent_shadow(kg, isect)) {
-					return true;
-				}
-
-#ifdef __VOLUME__
-				/* attenuation between last surface and next surface */
-				if(ps.volume_stack[0].shader != SHADER_NONE) {
-					Ray segment_ray = *ray;
-					segment_ray.t = isect->t;
-					kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
+	if((blocked && is_transparent_isect) || skip_object != OBJECT_NONE) {
+		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+		float3 Pend = ray->P + ray->D*ray->t;
+		int bounce = state->transparent_bounce;
+#    ifdef __VOLUME__
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#      else
+		PathState ps_object;
+		PathState *ps = &ps_object;
+#      endif
+		*ps = *state;
+#    endif
+		for(;;) {
+			if(bounce >= kernel_data.integrator.transparent_max_bounce) {
+				return true;
+			}
+			if(!scene_intersect(kg,
+			                    *ray,
+			                    PATH_RAY_SHADOW_TRANSPARENT,
+			                    isect,
+			                    NULL,
+			                    0.0f, 0.0f))
+			{
+				break;
+			}
+#ifdef __SHADOW_TRICKS__
+			if(skip_object != OBJECT_NONE) {
+				const int isect_object = (isect->object == PRIM_NONE)
+				        ? kernel_tex_fetch(__prim_object, isect->prim)
+				        : isect->object;
+				if(isect_object == skip_object) {
+					shader_setup_from_ray(kg, shadow_sd, isect, ray);
+					/* Move ray forward. */
+					ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
+					if(ray->t != FLT_MAX) {
+						ray->D = normalize_len(Pend - ray->P, &ray->t);
+					}
+					bounce++;
+					continue;
 				}
+			}
 #endif
-
-				/* setup shader data at surface */
-				shader_setup_from_ray(kg, shadow_sd, isect, ray);
-
-				/* attenuation from transparent surface */
-				if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) {
-					path_state_modify_bounce(state, true);
-					shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					path_state_modify_bounce(state, false);
-
-					throughput *= shader_bsdf_transparency(kg, shadow_sd);
-				}
-
-				/* stop if all light is blocked */
-				if(is_zero(throughput)) {
-					return true;
-				}
-
-				/* move ray forward */
-				ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng));
-				if(ray->t != FLT_MAX) {
-					ray->D = normalize_len(Pend - ray->P, &ray->t);
-				}
-
+			if(!shader_transparent_shadow(kg, isect)) {
+				return true;
+			}
+			/* Attenuate the throughput. */
+			if(shadow_handle_transparent_isect(kg,
+			                                   shadow_sd,
+			                                   state,
 #ifdef __VOLUME__
-				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
+			                                   ps,
 #endif
-
-				bounce++;
+			                                   isect,
+			                                   ray,
+			                                   &throughput))
+			{
+				return true;
 			}
+			/* Move ray forward. */
+			ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
+			if(ray->t != FLT_MAX) {
+				ray->D = normalize_len(Pend - ray->P, &ray->t);
+			}
+			bounce++;
+		}
+#    ifdef __VOLUME__
+		/* Attenuation for last line segment towards light. */
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
 		}
+#    endif
+		*shadow *= throughput;
+		return is_zero(throughput);
 	}
-#ifdef __VOLUME__
-	else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* apply attenuation from current volume shader */
+#    ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* Apply attenuation from current volume shader. */
 		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
 	}
-#endif
-#endif
-
+#    endif
 	return blocked;
 }
 
+ccl_device bool shadow_blocked_transparent_stepped(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+        const int skip_object,
+        Ray *ray,
+        Intersection *isect,
+        float3 *shadow)
+{
+	bool blocked, is_transparent_isect;
+	if (skip_object == OBJECT_NONE) {
+		blocked = scene_intersect(kg,
+		                          *ray,
+		                          PATH_RAY_SHADOW_OPAQUE,
+		                          isect,
+		                          NULL,
+		                          0.0f, 0.0f);
+		is_transparent_isect = blocked
+			        ? shader_transparent_shadow(kg, isect)
+			        : false;
+	}
+	else {
+		blocked = false;
+		is_transparent_isect = false;
+	}
+	return shadow_blocked_transparent_stepped_loop(kg,
+	                                               shadow_sd,
+	                                               state,
+	                                               skip_object,
+	                                               ray,
+	                                               isect,
+	                                               blocked,
+	                                               is_transparent_isect,
+	                                               shadow);
+}
+
+#  endif  /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
+#endif /* __TRANSPARENT_SHADOWS__ */
+
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
+                                      ShaderData *shadow_sd,
+                                      ccl_addr_space PathState *state,
+                                      Ray *ray_input,
+                                      float3 *shadow)
+{
+	Ray *ray = ray_input;
+	Intersection isect;
+	/* Some common early checks. */
+	*shadow = make_float3(1.0f, 1.0f, 1.0f);
+	if(ray->t == 0.0f) {
+		return false;
+	}
+#ifdef __SHADOW_TRICKS__
+    const int skip_object = state->catcher_object;
+#else
+    const int skip_object = OBJECT_NONE;
 #endif
+	/* Do actual shadow shading. */
+	/* First of all, we check if the integrator requires transparent shadows.
+	 * If not, we use the simplest and fastest way to calculate occlusion.
+	 *
+	 * NOTE: We can't do quick opaque test here if we are on shadow-catcher
+	 * path because we don't want catcher object to be casting shadow here.
+	 */
+#ifdef __TRANSPARENT_SHADOWS__
+	if(!kernel_data.integrator.transparent_shadows &&
+	   skip_object == OBJECT_NONE)
+#endif
+	{
+		return shadow_blocked_opaque(kg,
+		                             shadow_sd,
+		                             state,
+		                             ray,
+		                             &isect,
+		                             shadow);
+	}
+#ifdef __TRANSPARENT_SHADOWS__
+#  ifdef __SHADOW_RECORD_ALL__
+	/* For the transparent shadows we try to use record-all logic on the
+	 * devices which support this.
+	 */
+	const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+	/* Check transparent bounces here, for volume scatter which can do
+	 * lighting before surface path termination is checked.
+	 */
+	if(state->transparent_bounce >= transparent_max_bounce) {
+		return true;
+	}
+	const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
+#    ifdef __KERNEL_GPU__
+	/* On GPU we do trickery with tracing opaque ray first, this avoids speed
+	 * regressions in some files.
+	 *
+	 * TODO(sergey): Check why using record-all behavior causes slowdown in such
+	 * cases. Could that be caused by a higher spill pressure?
+	 */
+	const bool blocked = scene_intersect(kg,
+	                                     *ray,
+	                                     PATH_RAY_SHADOW_OPAQUE,
+	                                     &isect,
+	                                     NULL,
+	                                     0.0f, 0.0f);
+	const bool is_transparent_isect = blocked
+	        ? shader_transparent_shadow(kg, &isect)
+	        : false;
+	if(!blocked || !is_transparent_isect ||
+	   max_hits + 1 >= SHADOW_STACK_MAX_HITS)
+	{
+		return shadow_blocked_transparent_stepped_loop(kg,
+		                                               shadow_sd,
+		                                               state,
+		                                               skip_object,
+		                                               ray,
+		                                               &isect,
+		                                               blocked,
+		                                               is_transparent_isect,
+		                                               shadow);
+	}
+#    endif  /* __KERNEL_GPU__ */
+	return shadow_blocked_transparent_all(kg,
+	                                      shadow_sd,
+	                                      state,
+	                                      skip_object,
+	                                      ray,
+	                                      max_hits,
+	                                      shadow);
+#  else  /* __SHADOW_RECORD_ALL__ */
+	/* Fall back to the slowest version, which works on all devices. */
+	return shadow_blocked_transparent_stepped(kg,
+	                                          shadow_sd,
+	                                          state,
+	                                          skip_object,
+	                                          ray,
+	                                          &isect,
+	                                          shadow);
+#  endif  /* __SHADOW_RECORD_ALL__ */
+#endif  /* __TRANSPARENT_SHADOWS__ */
+}
 
-CCL_NAMESPACE_END
+#undef SHADOW_STACK_MAX_HITS
 
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index c5652ebf7dc..6c8b7cca4ce 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -201,7 +201,7 @@ ccl_device float3 subsurface_color_pow(float3 color, float exponent)
 
 ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
                                            ShaderData *sd,
-                                           PathState *state,
+                                           ccl_addr_space PathState *state,
                                            int state_flag,
                                            float3 *eval,
                                            float3 *N)
@@ -239,7 +239,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
         SubsurfaceIntersection *ss_isect,
         ShaderData *sd,
         ShaderClosure *sc,
-        uint *lcg_state,
+        RNG *lcg_state,
         float disk_u,
         float disk_v,
         bool all)
@@ -293,7 +293,12 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B;
 
 	/* create ray */
+#ifdef __SPLIT_KERNEL__
+	Ray ray_object = ss_isect->ray;
+	Ray *ray = &ray_object;
+#else
 	Ray *ray = &ss_isect->ray;
+#endif
 	ray->P = sd->P + disk_N*disk_height + disk_P;
 	ray->D = -disk_N;
 	ray->t = 2.0f*disk_height;
@@ -304,7 +309,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	/* intersect with the same object. if multiple intersections are found it
 	 * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */
 	scene_intersect_subsurface(kg,
-	                           ray,
+	                           *ray,
 	                           ss_isect,
 	                           sd->object,
 	                           lcg_state,
@@ -314,20 +319,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	for(int hit = 0; hit < num_eval_hits; hit++) {
 		/* Quickly retrieve P and Ng without setting up ShaderData. */
 		float3 hit_P;
-		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+		if(sd->type & PRIMITIVE_TRIANGLE) {
 			hit_P = triangle_refine_subsurface(kg,
 			                                   sd,
 			                                   &ss_isect->hits[hit],
 			                                   ray);
 		}
 #ifdef __OBJECT_MOTION__
-		else  if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) {
+		else  if(sd->type & PRIMITIVE_MOTION_TRIANGLE) {
 			float3 verts[3];
 			motion_triangle_vertices(
 			        kg,
-			        ccl_fetch(sd, object),
+			        sd->object,
 			        kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim),
-			        ccl_fetch(sd, time),
+			        sd->time,
 			        verts);
 			hit_P = motion_triangle_refine_subsurface(kg,
 			                                          sd,
@@ -367,6 +372,10 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 		ss_isect->weight[hit] = eval;
 	}
 
+#ifdef __SPLIT_KERNEL__
+	ss_isect->ray = *ray;
+#endif
+
 	return num_eval_hits;
 }
 
@@ -375,13 +384,19 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
         SubsurfaceIntersection* ss_isect,
         int hit,
         ShaderData *sd,
-        PathState *state,
+        ccl_addr_space PathState *state,
         int state_flag,
         ShaderClosure *sc,
         bool all)
 {
+#ifdef __SPLIT_KERNEL__
+	Ray ray_object = ss_isect->ray;
+	Ray *ray = &ray_object;
+#else
+	Ray *ray = &ss_isect->ray;
+#endif
 	/* Setup new shading point. */
-	shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], &ss_isect->ray);
+	shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
 
 	/* Optionally blur colors and bump mapping. */
 	float3 weight = ss_isect->weight[hit];
@@ -392,6 +407,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
 	subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
 }
 
+#ifndef __SPLIT_KERNEL__
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
 ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
 	int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
@@ -448,7 +464,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 	/* intersect with the same object. if multiple intersections are
 	 * found it will randomly pick one of them */
 	SubsurfaceIntersection ss_isect;
-	scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1);
+	scene_intersect_subsurface(kg, ray, &ss_isect, sd->object, lcg_state, 1);
 
 	/* evaluate bssrdf */
 	if(ss_isect.num_hits > 0) {
@@ -481,6 +497,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 	/* setup diffuse bsdf */
 	subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
 }
+#endif /* ! __SPLIT_KERNEL__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index 8d5bb75a428..cb1a3f40dee 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -32,6 +32,7 @@ KERNEL_TEX(uint, texture_uint, __prim_visibility)
 KERNEL_TEX(uint, texture_uint, __prim_index)
 KERNEL_TEX(uint, texture_uint, __prim_object)
 KERNEL_TEX(uint, texture_uint, __object_node)
+KERNEL_TEX(float2, texture_float2, __prim_time)
 
 /* objects */
 KERNEL_TEX(float4, texture_float4, __objects)
@@ -177,7 +178,6 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
 
 #  else
 /* bindless textures */
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 8c271c75e44..19c91248922 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -17,9 +17,9 @@
 #ifndef __KERNEL_TYPES_H__
 #define __KERNEL_TYPES_H__
 
-#include "kernel_math.h"
-#include "svm/svm_types.h"
-#include "util_static_assert.h"
+#include "kernel/kernel_math.h"
+#include "kernel/svm/svm_types.h"
+#include "util/util_static_assert.h"
 
 #ifndef __KERNEL_GPU__
 #  define __KERNEL_CPU__
@@ -56,6 +56,8 @@ CCL_NAMESPACE_BEGIN
 
 #define VOLUME_STACK_SIZE		16
 
+#define WORK_POOL_SIZE 64
+
 /* device capabilities */
 #ifdef __KERNEL_CPU__
 #  ifdef __KERNEL_SSE2__
@@ -63,27 +65,34 @@ CCL_NAMESPACE_BEGIN
 #  endif
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
-#  define __BRANCHED_PATH__
+#  ifndef __SPLIT_KERNEL__
+#    define __BRANCHED_PATH__
+#  endif
 #  ifdef WITH_OSL
 #    define __OSL__
 #  endif
 #  define __SUBSURFACE__
 #  define __CMJ__
 #  define __VOLUME__
-#  define __VOLUME_DECOUPLED__
 #  define __VOLUME_SCATTER__
 #  define __SHADOW_RECORD_ALL__
-#  define __VOLUME_RECORD_ALL__
+#  ifndef __SPLIT_KERNEL__
+#    define __VOLUME_DECOUPLED__
+#    define __VOLUME_RECORD_ALL__
+#  endif
 #endif  /* __KERNEL_CPU__ */
 
 #ifdef __KERNEL_CUDA__
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
-#  define __BRANCHED_PATH__
 #  define __VOLUME__
 #  define __VOLUME_SCATTER__
 #  define __SUBSURFACE__
-#  define __CMJ__
+#  define __SHADOW_RECORD_ALL__
+#  ifndef __SPLIT_KERNEL__
+#    define __BRANCHED_PATH__
+#    define __CMJ__
+#  endif
 #endif  /* __KERNEL_CUDA__ */
 
 #ifdef __KERNEL_OPENCL__
@@ -93,6 +102,10 @@ CCL_NAMESPACE_BEGIN
 #  ifdef __KERNEL_OPENCL_NVIDIA__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __SUBSURFACE__
+#    define __VOLUME__
+#    define __VOLUME_SCATTER__
+#    define __SHADOW_RECORD_ALL__
 #    ifdef __KERNEL_EXPERIMENTAL__
 #      define __CMJ__
 #    endif
@@ -114,6 +127,10 @@ CCL_NAMESPACE_BEGIN
 #    define __CL_USE_NATIVE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __SUBSURFACE__
+#    define __VOLUME__
+#    define __VOLUME_SCATTER__
+#    define __SHADOW_RECORD_ALL__
 #  endif  /* __KERNEL_OPENCL_AMD__ */
 
 #  ifdef __KERNEL_OPENCL_INTEL_CPU__
@@ -140,6 +157,7 @@ CCL_NAMESPACE_BEGIN
 #define __INTERSECTION_REFINE__
 #define __CLAMP_SAMPLE__
 #define __PATCH_EVAL__
+#define __SHADOW_TRICKS__
 
 #ifdef __KERNEL_SHADING__
 #  define __SVM__
@@ -195,6 +213,9 @@ CCL_NAMESPACE_BEGIN
 #ifdef __NO_TRANSPARENT__
 #  undef __TRANSPARENT_SHADOWS__
 #endif
+#ifdef __NO_SHADOW_TRICKS__
+#undef __SHADOW_TRICKS__
+#endif
 
 /* Random Numbers */
 
@@ -299,6 +320,8 @@ enum PathRayFlag {
 	PATH_RAY_MIS_SKIP = 4096,
 	PATH_RAY_DIFFUSE_ANCESTOR = 8192,
 	PATH_RAY_SINGLE_PASS_DONE = 16384,
+	PATH_RAY_SHADOW_CATCHER = 32768,
+	PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
 };
 
 /* Closure Label */
@@ -428,6 +451,20 @@ typedef ccl_addr_space struct PathRadiance {
 	float4 shadow;
 	float mist;
 #endif
+
+#ifdef __SHADOW_TRICKS__
+	/* Total light reachable across the path, ignoring shadow blocked queries. */
+	float3 path_total;
+	/* Total light reachable across the path with shadow blocked queries
+	 * applied here.
+	 *
+	 * Dividing this figure by path_total will give an estimate of the shadow pass.
+	 */
+	float3 path_total_shaded;
+
+	/* Color of the background on which shadow is alpha-overed. */
+	float3 shadow_color;
+#endif
 } PathRadiance;
 
 typedef struct BsdfEval {
@@ -443,6 +480,9 @@ typedef struct BsdfEval {
 	float3 subsurface;
 	float3 scatter;
 #endif
+#ifdef __SHADOW_TRICKS__
+	float3 sum_no_mis;
+#endif
 } BsdfEval;
 
 /* Shader Flag */
@@ -536,7 +576,7 @@ typedef struct Ray {
 
 /* Intersection */
 
-typedef ccl_addr_space struct Intersection {
+typedef struct Intersection {
 	float t, u, v;
 	int prim;
 	int object;
@@ -788,108 +828,89 @@ enum ShaderDataObjectFlag {
 	SD_OBJECT_INTERSECTS_VOLUME      = (1 << 5),
 	/* Has position for motion vertices. */
 	SD_OBJECT_HAS_VERTEX_MOTION      = (1 << 6),
+	/* Object is used to catch shadows. */
+	SD_OBJECT_SHADOW_CATCHER         = (1 << 7),
 
 	SD_OBJECT_FLAGS = (SD_OBJECT_HOLDOUT_MASK |
 	                   SD_OBJECT_MOTION |
 	                   SD_OBJECT_TRANSFORM_APPLIED |
 	                   SD_OBJECT_NEGATIVE_SCALE_APPLIED |
 	                   SD_OBJECT_HAS_VOLUME |
-	                   SD_OBJECT_INTERSECTS_VOLUME)
+	                   SD_OBJECT_INTERSECTS_VOLUME |
+	                   SD_OBJECT_SHADOW_CATCHER)
 };
 
-#ifdef __SPLIT_KERNEL__
-#  define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0))
-#  if !defined(__SPLIT_KERNEL_SOA__)
-     /* ShaderData is stored as an Array-of-Structures */
-#    define ccl_soa_member(type, name) type soa_##name
-#    define ccl_fetch(s, t) (s[SD_THREAD].soa_##t)
-#    define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index])
-#  else
-     /* ShaderData is stored as an Structure-of-Arrays */
-#    define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1))
-#    define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t)
-#    define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0)
-#    define ccl_soa_member(type, name) type soa_##name
-#    define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) +  SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t)
-#    define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index])
-#  endif
-#else
-#  define ccl_soa_member(type, name) type name
-#  define ccl_fetch(s, t) (s->t)
-#  define ccl_fetch_array(s, t, index) (&s->t[index])
-#endif
-
 typedef ccl_addr_space struct ShaderData {
 	/* position */
-	ccl_soa_member(float3, P);
+	float3 P;
 	/* smooth normal for shading */
-	ccl_soa_member(float3, N);
+	float3 N;
 	/* true geometric normal */
-	ccl_soa_member(float3, Ng);
+	float3 Ng;
 	/* view/incoming direction */
-	ccl_soa_member(float3, I);
+	float3 I;
 	/* shader id */
-	ccl_soa_member(int, shader);
+	int shader;
 	/* booleans describing shader, see ShaderDataFlag */
-	ccl_soa_member(int, flag);
+	int flag;
 	/* booleans describing object of the shader, see ShaderDataObjectFlag */
-	ccl_soa_member(int, object_flag);
+	int object_flag;
 
 	/* primitive id if there is one, ~0 otherwise */
-	ccl_soa_member(int, prim);
+	int prim;
 
 	/* combined type and curve segment for hair */
-	ccl_soa_member(int, type);
+	int type;
 
 	/* parametric coordinates
 	 * - barycentric weights for triangles */
-	ccl_soa_member(float, u);
-	ccl_soa_member(float, v);
+	float u;
+	float v;
 	/* object id if there is one, ~0 otherwise */
-	ccl_soa_member(int, object);
+	int object;
 
 	/* motion blur sample time */
-	ccl_soa_member(float, time);
+	float time;
 
 	/* length of the ray being shaded */
-	ccl_soa_member(float, ray_length);
+	float ray_length;
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differential of P. these are orthogonal to Ng, not N */
-	ccl_soa_member(differential3, dP);
+	differential3 dP;
 	/* differential of I */
-	ccl_soa_member(differential3, dI);
+	differential3 dI;
 	/* differential of u, v */
-	ccl_soa_member(differential, du);
-	ccl_soa_member(differential, dv);
+	differential du;
+	differential dv;
 #endif
 #ifdef __DPDU__
 	/* differential of P w.r.t. parametric coordinates. note that dPdu is
 	 * not readily suitable as a tangent for shading on triangles. */
-	ccl_soa_member(float3, dPdu);
-	ccl_soa_member(float3, dPdv);
+	float3 dPdu;
+	float3 dPdv;
 #endif
 
 #ifdef __OBJECT_MOTION__
 	/* object <-> world space transformations, cached to avoid
 	 * re-interpolating them constantly for shading */
-	ccl_soa_member(Transform, ob_tfm);
-	ccl_soa_member(Transform, ob_itfm);
+	Transform ob_tfm;
+	Transform ob_itfm;
 #endif
 
 	/* Closure data, we store a fixed array of closures */
-	ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]);
-	ccl_soa_member(int, num_closure);
-	ccl_soa_member(int, num_closure_extra);
-	ccl_soa_member(float, randb_closure);
-	ccl_soa_member(float3, svm_closure_weight);
+	struct ShaderClosure closure[MAX_CLOSURE];
+	int num_closure;
+	int num_closure_extra;
+	float randb_closure;
+	float3 svm_closure_weight;
 
 	/* LCG state for closures that require additional random numbers. */
-	ccl_soa_member(uint, lcg_state);
+	uint lcg_state;
 
 	/* ray start position, only set for backgrounds */
-	ccl_soa_member(float3, ray_P);
-	ccl_soa_member(differential3, ray_dP);
+	float3 ray_P;
+	differential3 ray_dP;
 
 #ifdef __OSL__
 	struct KernelGlobals *osl_globals;
@@ -935,12 +956,16 @@ typedef struct PathState {
 	RNG rng_congruential;
 	VolumeStack volume_stack[VOLUME_STACK_SIZE];
 #endif
+
+#ifdef __SHADOW_TRICKS__
+	int catcher_object;
+#endif
 } PathState;
 
 /* Subsurface */
 
 /* Struct to gather multiple SSS hits. */
-struct SubsurfaceIntersection
+typedef struct SubsurfaceIntersection
 {
 	Ray ray;
 	float3 weight[BSSRDF_MAX_HITS];
@@ -948,10 +973,10 @@ struct SubsurfaceIntersection
 	int num_hits;
 	struct Intersection hits[BSSRDF_MAX_HITS];
 	float3 Ng[BSSRDF_MAX_HITS];
-};
+} SubsurfaceIntersection;
 
 /* Struct to gather SSS indirect rays and delay tracing them. */
-struct SubsurfaceIndirectRays
+typedef struct SubsurfaceIndirectRays
 {
 	bool need_update_volume_stack;
 	bool tracing;
@@ -962,7 +987,7 @@ struct SubsurfaceIndirectRays
 	struct Ray rays[BSSRDF_MAX_HITS];
 	float3 throughputs[BSSRDF_MAX_HITS];
 	struct PathRadiance L[BSSRDF_MAX_HITS];
-};
+} SubsurfaceIndirectRays;
 
 /* Constant Kernel Data
  *
@@ -1201,7 +1226,8 @@ typedef struct KernelBVH {
 	int have_curves;
 	int have_instancing;
 	int use_qbvh;
-	int pad1, pad2;
+	int use_bvh_steps;
+	int pad1;
 } KernelBVH;
 static_assert_align(KernelBVH, 16);
 
@@ -1296,20 +1322,19 @@ enum QueueNumber {
 #define RAY_STATE_MASK 0x007
 #define RAY_FLAG_MASK 0x0F8
 enum RayState {
+	RAY_INVALID = 0,
 	/* Denotes ray is actively involved in path-iteration. */
-	RAY_ACTIVE = 0,
+	RAY_ACTIVE,
 	/* Denotes ray has completed processing all samples and is inactive. */
-	RAY_INACTIVE = 1,
+	RAY_INACTIVE,
 	/* Denoted ray has exited path-iteration and needs to update output buffer. */
-	RAY_UPDATE_BUFFER = 2,
+	RAY_UPDATE_BUFFER,
 	/* Donotes ray has hit background */
-	RAY_HIT_BACKGROUND = 3,
+	RAY_HIT_BACKGROUND,
 	/* Denotes ray has to be regenerated */
-	RAY_TO_REGENERATE = 4,
+	RAY_TO_REGENERATE,
 	/* Denotes ray has been regenerated */
-	RAY_REGENERATED = 5,
-	/* Denotes ray should skip direct lighting */
-	RAY_SKIP_DL = 6,
+	RAY_REGENERATED,
 	/* Flag's ray has to execute shadow blocked function in AO part */
 	RAY_SHADOW_RAY_CAST_AO = 16,
 	/* Flag's ray has to execute shadow blocked function in direct lighting part. */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index c7cb29b5af2..9c0878249d4 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -38,7 +38,7 @@ typedef struct VolumeShaderCoefficients {
 /* evaluate shader to get extinction coefficient at P */
 ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
                                                        ShaderData *sd,
-                                                       PathState *state,
+                                                       ccl_addr_space PathState *state,
                                                        float3 P,
                                                        float3 *extinction)
 {
@@ -64,7 +64,7 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
 /* evaluate shader to get absorption, scattering and emission at P */
 ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
                                             ShaderData *sd,
-                                            PathState *state,
+                                            ccl_addr_space PathState *state,
                                             float3 P,
                                             VolumeShaderCoefficients *coeff)
 {
@@ -112,7 +112,7 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel)
 	return (channel == 0)? value.x: ((channel == 1)? value.y: value.z);
 }
 
-ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *stack)
+ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
 {
 	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
 		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE);
@@ -161,7 +161,11 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac
 
 /* homogeneous volume: assume shader evaluation at the starts gives
  * the extinction coefficient for the entire line segment */
-ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
+ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg,
+                                                 ccl_addr_space PathState *state,
+                                                 Ray *ray,
+                                                 ShaderData *sd,
+                                                 float3 *throughput)
 {
 	float3 sigma_t;
 
@@ -171,7 +175,11 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s
 
 /* heterogeneous volume: integrate stepping through the volume until we
  * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
+ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
+                                                   ccl_addr_space PathState *state,
+                                                   Ray *ray,
+                                                   ShaderData *sd,
+                                                   float3 *throughput)
 {
 	float3 tp = *throughput;
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -179,7 +187,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
 	float step = kernel_data.integrator.volume_step_size;
-	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step;
+	float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step;
 
 	/* compute extinction at the start */
 	float t = 0.0f;
@@ -193,7 +201,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 		/* use random position inside this segment to sample shader */
 		if(new_t == ray->t)
-			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+			random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * dt;
 
 		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
 		float3 sigma_t;
@@ -227,7 +235,11 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 /* get the volume attenuation over line segment defined by ray, with the
  * assumption that there are no surfaces blocking light between the endpoints */
-ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *throughput)
+ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg,
+                                              ShaderData *shadow_sd,
+                                              ccl_addr_space PathState *state,
+                                              Ray *ray,
+                                              float3 *throughput)
 {
 	shader_setup_from_volume(kg, shadow_sd, ray);
 
@@ -341,9 +353,15 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
 
 /* homogeneous volume: assume shader evaluation at the start gives
  * the volume shading coefficient for the entire line segment */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
-	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
-	RNG *rng, bool probalistic_scatter)
+ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    Ray *ray,
+    ShaderData *sd,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    RNG *rng,
+    bool probalistic_scatter)
 {
 	VolumeShaderCoefficients coeff;
 
@@ -444,8 +462,14 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
  * volume until we reach the end, get absorbed entirely, or run out of
  * iterations. this does probabilistically scatter or get transmitted through
  * for path tracing where we don't want to branch. */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
-	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
+ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    Ray *ray,
+    ShaderData *sd,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    RNG *rng)
 {
 	float3 tp = *throughput;
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -453,7 +477,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
 	float step_size = kernel_data.integrator.volume_step_size;
-	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
+	float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step_size;
 
 	/* compute coefficients at the start */
 	float t = 0.0f;
@@ -474,7 +498,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 
 		/* use random position inside this segment to sample shader */
 		if(new_t == ray->t)
-			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+			random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * dt;
 
 		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
 		VolumeShaderCoefficients coeff;
@@ -579,8 +603,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
  * ray, with the assumption that there are no surfaces blocking light
  * between the endpoints. distance sampling is used to decide if we will
  * scatter or not. */
-ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
-	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous)
+ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    ShaderData *sd,
+    Ray *ray,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    RNG *rng,
+    bool heterogeneous)
 {
 	shader_setup_from_volume(kg, sd, ray);
 
@@ -590,6 +621,7 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals
 		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true);
 }
 
+#ifndef __SPLIT_KERNEL__
 /* Decoupled Volume Sampling
  *
  * VolumeSegment is list of coefficients and transmittance stored at all steps
@@ -966,7 +998,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
 		}
 	}
-	if(sample_t < 1e-6f) {
+	if(sample_t < 1e-6f || pdf == 0.0f) {
 		return VOLUME_PATH_SCATTERED;
 	}
 
@@ -990,6 +1022,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 
 	return VOLUME_PATH_SCATTERED;
 }
+#endif /* ! __SPLIT_KERNEL__ */
 
 /* decide if we need to use decoupled or not */
 ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
@@ -1021,9 +1054,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou
 
 ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
                                          ShaderData *stack_sd,
-                                         const PathState *state,
-                                         const Ray *ray,
-                                         VolumeStack *stack)
+                                         ccl_addr_space const PathState *state,
+                                         ccl_addr_space const Ray *ray,
+                                         ccl_addr_space VolumeStack *stack)
 {
 	/* NULL ray happens in the baker, does it need proper initialization of
 	 * camera in volume?
@@ -1166,7 +1199,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 	}
 }
 
-ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack)
+ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, ccl_addr_space VolumeStack *stack)
 {
 	/* todo: we should have some way for objects to indicate if they want the
 	 * world shader to work inside them. excluding it by default is problematic
@@ -1215,7 +1248,7 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
                                                           ShaderData *stack_sd,
                                                           Ray *ray,
-                                                          VolumeStack *stack)
+                                                          ccl_addr_space VolumeStack *stack)
 {
 	kernel_assert(kernel_data.integrator.use_volumes);
 
@@ -1277,7 +1310,7 @@ ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
  * the world's one after the last bounce to avoid render artifacts.
  */
 ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
-                                                 VolumeStack *volume_stack)
+                                                 ccl_addr_space VolumeStack *volume_stack)
 {
 	if(kernel_data.background.volume_shader != SHADER_NONE) {
 		/* Keep the world's volume in stack. */
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 7d559b1aa31..28fc5ce1c30 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -17,177 +17,102 @@
 #ifndef __KERNEL_WORK_STEALING_H__
 #define __KERNEL_WORK_STEALING_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Utility functions for work stealing
  */
 
-#ifdef __WORK_STEALING__
-
 #ifdef __KERNEL_OPENCL__
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
-uint get_group_id_with_ray_index(uint ray_index,
-                                 uint tile_dim_x,
-                                 uint tile_dim_y,
-                                 uint parallel_samples,
-                                 int dim)
+ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
+{
+	return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples;
+}
+
+ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
+{
+	return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index)
+{
+	return ray_index / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
 {
-	if(dim == 0) {
-		uint x_span = ray_index % (tile_dim_x * parallel_samples);
-		return x_span / get_local_size(0);
+	uint total_work_size = kernel_total_work_size(kg);
+	uint num_pools = kernel_num_work_pools(kg);
+
+	if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) {
+		return 0;
+	}
+
+	uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
+
+	uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
+	if(work_pool < remainder / WORK_POOL_SIZE) {
+		work_size += WORK_POOL_SIZE;
 	}
-	else /*if(dim == 1)*/ {
-		kernel_assert(dim == 1);
-		uint y_span = ray_index / (tile_dim_x * parallel_samples);
-		return y_span / get_local_size(1);
+	else if(work_pool == remainder / WORK_POOL_SIZE) {
+		work_size += remainder % WORK_POOL_SIZE;
 	}
+
+	return work_size;
 }
 
-uint get_total_work(uint tile_dim_x,
-                    uint tile_dim_y,
-                    uint grp_idx,
-                    uint grp_idy,
-                    uint num_samples)
+ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index)
 {
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	return threads_within_tile_border_x *
-	       threads_within_tile_border_y *
-	       num_samples;
+	uint num_pools = kernel_num_work_pools(kg);
+	uint pool = work_pool_from_ray_index(kg, ray_index);
+
+	return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
+	       + (pool * WORK_POOL_SIZE)
+	       + (work_index % WORK_POOL_SIZE);
 }
 
-/* Returns 0 in case there is no next work available */
-/* Returns 1 in case work assigned is valid */
-int get_next_work(ccl_global uint *work_pool,
-                  ccl_private uint *my_work,
-                  uint tile_dim_x,
-                  uint tile_dim_y,
-                  uint num_samples,
-                  uint parallel_samples,
-                  uint ray_index)
+/* Returns true if there is work */
+ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index)
 {
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint total_work = get_total_work(tile_dim_x,
-	                                 tile_dim_y,
-	                                 grp_idx,
-	                                 grp_idy,
-	                                 num_samples);
-	uint group_index = grp_idy * get_num_groups(0) + grp_idx;
-	*my_work = atomic_inc(&work_pool[group_index]);
-	return (*my_work < total_work) ? 1 : 0;
+	uint work_pool = work_pool_from_ray_index(kg, ray_index);
+	uint pool_size = work_pool_work_size(kg, work_pool);
+
+	if(pool_size == 0) {
+		return false;
+	}
+
+	*work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
+	return (*work_index < pool_size);
 }
 
-/* This function assumes that the passed my_work is valid. */
-/* Decode sample number w.r.t. assigned my_work. */
-uint get_my_sample(uint my_work,
-                   uint tile_dim_x,
-                   uint tile_dim_y,
-                   uint parallel_samples,
-                   uint ray_index)
+/* This function assumes that the passed `work_index` is valid. */
+/* Decode sample number w.r.t. assigned `work_index`. */
+ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index)
 {
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	return my_work /
-	       (threads_within_tile_border_x * threads_within_tile_border_y);
+	return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h);
 }
 
-/* Decode pixel and tile position w.r.t. assigned my_work. */
-void get_pixel_tile_position(ccl_private uint *pixel_x,
+/* Decode pixel and tile position w.r.t. assigned `work_index`. */
+ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
+                             ccl_private uint *pixel_x,
                              ccl_private uint *pixel_y,
                              ccl_private uint *tile_x,
                              ccl_private uint *tile_y,
-                             uint my_work,
-                             uint tile_dim_x,
-                             uint tile_dim_y,
-                             uint tile_offset_x,
-                             uint tile_offset_y,
-                             uint parallel_samples,
+                             uint work_index,
                              uint ray_index)
 {
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	uint total_associated_pixels =
-		threads_within_tile_border_x * threads_within_tile_border_y;
-	uint work_group_pixel_index = my_work % total_associated_pixels;
-	uint work_group_pixel_x =
-		work_group_pixel_index % threads_within_tile_border_x;
-	uint work_group_pixel_y =
-		work_group_pixel_index / threads_within_tile_border_x;
-
-	*pixel_x =
-		tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
-	*pixel_y =
-		tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
-	*tile_x = *pixel_x - tile_offset_x;
-	*tile_y = *pixel_y - tile_offset_y;
+	uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h);
+
+	*tile_x = pixel_index % kernel_split_params.w;
+	*tile_y = pixel_index / kernel_split_params.w;
+
+	*pixel_x = *tile_x + kernel_split_params.x;
+	*pixel_y = *tile_y + kernel_split_params.y;
 }
 
-#endif  /* __WORK_STEALING__ */
+CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index 72dbbd9a416..16992c681e6 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -56,9 +56,9 @@
     /* do nothing */
 #endif
 
-#include "kernel.h"
+#include "kernel/kernel.h"
 #define KERNEL_ARCH cpu
-#include "kernel_cpu_impl.h"
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -90,7 +90,7 @@ void kernel_tex_copy(KernelGlobals *kg,
 		kg->tname.width = width; \
 	}
 #define KERNEL_IMAGE_TEX(type, ttype, tname)
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	else if(strstr(name, "__tex_image_float4")) {
 		texture_image_float4 *tex = NULL;
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 1350d9e5c2e..2600d977972 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -28,10 +28,10 @@
 #  define __KERNEL_AVX__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_avx
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index 1a416e771ee..dba15d037ac 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -29,10 +29,10 @@
 #  define __KERNEL_AVX2__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_avx2
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 1a07c705f1c..896b80d783e 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -49,4 +49,44 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        int offset,
                                        int sample);
 
+/* Split kernels */
+
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+        KernelGlobals *kg,
+        ccl_constant KernelData *data,
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+        ccl_global uint *rng_state,
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
+        ccl_global int *Queue_index,
+        int queuesize,
+        ccl_global char *use_queues_flag,
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+        ccl_global float *buffer);
+
+#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data);
+
+DECLARE_SPLIT_KERNEL_FUNCTION(path_init)
+DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
+DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
+DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
+
+void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
+
 #undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index ec82d4b4c22..148b2eef568 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -20,18 +20,45 @@
  * simply includes this file without worry of copying actual implementation over.
  */
 
-#include "kernel_compat_cpu.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_cpu_image.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_path_branched.h"
-#include "kernel_bake.h"
+#include "kernel/kernel_compat_cpu.h"
+
+#ifndef __SPLIT_KERNEL__
+#  include "kernel/kernel_math.h"
+#  include "kernel/kernel_types.h"
+
+#  include "kernel/split/kernel_split_data.h"
+#  include "kernel/kernel_globals.h"
+
+#  include "kernel/kernels/cpu/kernel_cpu_image.h"
+#  include "kernel/kernel_film.h"
+#  include "kernel/kernel_path.h"
+#  include "kernel/kernel_path_branched.h"
+#  include "kernel/kernel_bake.h"
+#else
+#  include "kernel/split/kernel_split_common.h"
+
+#  include "kernel/split/kernel_data_init.h"
+#  include "kernel/split/kernel_path_init.h"
+#  include "kernel/split/kernel_scene_intersect.h"
+#  include "kernel/split/kernel_lamp_emission.h"
+#  include "kernel/split/kernel_do_volume.h"
+#  include "kernel/split/kernel_queue_enqueue.h"
+#  include "kernel/split/kernel_indirect_background.h"
+#  include "kernel/split/kernel_shader_eval.h"
+#  include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#  include "kernel/split/kernel_subsurface_scatter.h"
+#  include "kernel/split/kernel_direct_lighting.h"
+#  include "kernel/split/kernel_shadow_blocked_ao.h"
+#  include "kernel/split/kernel_shadow_blocked_dl.h"
+#  include "kernel/split/kernel_next_iteration_setup.h"
+#  include "kernel/split/kernel_indirect_subsurface.h"
+#  include "kernel/split/kernel_buffer_update.h"
+#endif
 
 CCL_NAMESPACE_BEGIN
 
+#ifndef __SPLIT_KERNEL__
+
 /* Path Tracing */
 
 void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
@@ -131,4 +158,72 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 	}
 }
 
+#else  /* __SPLIT_KERNEL__ */
+
+/* Split Kernel Path Tracing */
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		kernel_##name(kg); \
+	}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		ccl_local type locals; \
+		kernel_##name(kg, &locals); \
+	}
+
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+
+void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
+{
+#define REGISTER_NAME_STRING(name) #name
+#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
+#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
+
+	REGISTER(path_trace);
+	REGISTER(convert_to_byte);
+	REGISTER(convert_to_half_float);
+	REGISTER(shader);
+
+	REGISTER(data_init);
+	REGISTER(path_init);
+	REGISTER(scene_intersect);
+	REGISTER(lamp_emission);
+	REGISTER(do_volume);
+	REGISTER(queue_enqueue);
+	REGISTER(indirect_background);
+	REGISTER(shader_eval);
+	REGISTER(holdout_emission_blurring_pathtermination_ao);
+	REGISTER(subsurface_scatter);
+	REGISTER(direct_lighting);
+	REGISTER(shadow_blocked_ao);
+	REGISTER(shadow_blocked_dl);
+	REGISTER(next_iteration_setup);
+	REGISTER(indirect_subsurface);
+	REGISTER(buffer_update);
+
+#undef REGISTER
+#undef REGISTER_EVAL_NAME
+#undef REGISTER_NAME_STRING
+}
+
+#endif  /* __SPLIT_KERNEL__ */
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
new file mode 100644
index 00000000000..ca750e5a00d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+#  ifdef __SSE2__
+#    ifndef __KERNEL_SSE2__
+#      define __KERNEL_SSE2__
+#    endif
+#  endif
+#  ifdef __SSE3__
+#    define __KERNEL_SSE3__
+#  endif
+#  ifdef __SSSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#  ifdef __SSE4_1__
+#    define __KERNEL_SSE41__
+#  endif
+#  ifdef __AVX__
+#    define __KERNEL_AVX__
+#  endif
+#  ifdef __AVX2__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX2__
+#  endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+    /* do nothing */
+#endif
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
new file mode 100644
index 00000000000..27a746a0799
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+ 
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE2__
+#  define __KERNEL_SSE3__
+#  define __KERNEL_SSSE3__
+#  define __KERNEL_SSE41__
+#  define __KERNEL_AVX__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_avx
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
new file mode 100644
index 00000000000..364d279a189
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE__
+#  define __KERNEL_SSE2__
+#  define __KERNEL_SSE3__
+#  define __KERNEL_SSSE3__
+#  define __KERNEL_SSE41__
+#  define __KERNEL_AVX__
+#  define __KERNEL_AVX2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_avx2
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
new file mode 100644
index 00000000000..0afb481296f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_sse2
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
new file mode 100644
index 00000000000..13d00813591
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE2__
+#  define __KERNEL_SSE3__
+#  define __KERNEL_SSSE3__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_sse3
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
new file mode 100644
index 00000000000..a4312071edc
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE2__
+#  define __KERNEL_SSE3__
+#  define __KERNEL_SSSE3__
+#  define __KERNEL_SSE41__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_sse41
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index a5f2d6e7294..1acfaa91ac9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -23,10 +23,10 @@
 #  define __KERNEL_SSE2__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_sse2
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index 86f9ce991f8..f7b6a2e21fe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -25,10 +25,10 @@
 #  define __KERNEL_SSSE3__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_sse3
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index c174406047d..1900c6e3012 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -26,10 +26,10 @@
 #  define __KERNEL_SSE41__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_sse41
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index eb2b6ea5414..dc343cb387a 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -16,113 +16,19 @@
 
 /* CUDA kernel entry points */
 
-#include "../../kernel_compat_cuda.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_film.h"
-#include "../../kernel_path.h"
-#include "../../kernel_path_branched.h"
-#include "../../kernel_bake.h"
-
-/* device data taken from CUDA occupancy calculator */
-
 #ifdef __CUDA_ARCH__
 
-/* 2.0 and 2.1 */
-#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 32
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
-
-/* 3.0 and 3.5 */
-#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.0, 5.2, 5.3, 6.0, 6.1 */
-#elif __CUDA_ARCH__ >= 500
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 48
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* unknown architecture */
-#else
-#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
-	__launch_bounds__( \
-		threads_block_width*threads_block_width, \
-		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
-		)
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-#  error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
-#  error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
+#include "kernel/kernel_compat_cuda.h"
+#include "kernel_config.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_path.h"
+#include "kernel/kernel_path_branched.h"
+#include "kernel/kernel_bake.h"
 
 /* kernels */
-
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
@@ -130,8 +36,10 @@ kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int s
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+	if(x < sx + sw && y < sy + sh) {
+		KernelGlobals kg;
+		kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+	}
 }
 
 #ifdef __BRANCHED_PATH__
@@ -142,8 +50,10 @@ kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+	if(x < sx + sw && y < sy + sh) {
+		KernelGlobals kg;
+		kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+	}
 }
 #endif
 
@@ -154,8 +64,9 @@ kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
+	if(x < sx + sw && y < sy + sh) {
 		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+	}
 }
 
 extern "C" __global__ void
@@ -165,8 +76,9 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
+	if(x < sx + sw && y < sy + sh) {
 		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+	}
 }
 
 extern "C" __global__ void
@@ -183,7 +95,8 @@ kernel_cuda_shader(uint4 *input,
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	if(x < sx + sw) {
-		kernel_shader_evaluate(NULL,
+		KernelGlobals kg;
+		kernel_shader_evaluate(&kg,
 		                       input,
 		                       output,
 		                       output_luma,
@@ -200,8 +113,10 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
-	if(x < sx + sw)
-		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+	if(x < sx + sw) {
+		KernelGlobals kg;
+		kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+	}
 }
 #endif
 
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
new file mode 100644
index 00000000000..9fa39dc9ebb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* device data taken from CUDA occupancy calculator */
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 32
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0, 5.2, 5.3, 6.0, 6.1 */
+#elif __CUDA_ARCH__ >= 500
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 48
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		threads_block_width*threads_block_width, \
+		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
+		)
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#  error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#  error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
new file mode 100644
index 00000000000..a679eff8409
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA split kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#define __SPLIT_KERNEL__
+
+#include "kernel/kernel_compat_cuda.h"
+#include "kernel_config.h"
+
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_data_init.h"
+#include "kernel/split/kernel_path_init.h"
+#include "kernel/split/kernel_scene_intersect.h"
+#include "kernel/split/kernel_lamp_emission.h"
+#include "kernel/split/kernel_do_volume.h"
+#include "kernel/split/kernel_queue_enqueue.h"
+#include "kernel/split/kernel_indirect_background.h"
+#include "kernel/split/kernel_shader_eval.h"
+#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "kernel/split/kernel_subsurface_scatter.h"
+#include "kernel/split/kernel_direct_lighting.h"
+#include "kernel/split/kernel_shadow_blocked_ao.h"
+#include "kernel/split/kernel_shadow_blocked_dl.h"
+#include "kernel/split/kernel_next_iteration_setup.h"
+#include "kernel/split/kernel_indirect_subsurface.h"
+#include "kernel/split/kernel_buffer_update.h"
+
+#include "kernel/kernel_film.h"
+
+/* kernels */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
+{
+	*size = split_data_buffer_size(NULL, num_threads);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace_data_init(
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+        ccl_global uint *rng_state,
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
+        ccl_global int *Queue_index,
+        int queuesize,
+        ccl_global char *use_queues_flag,
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+        ccl_global float *buffer)
+{
+	kernel_data_init(NULL,
+	                 NULL,
+	                 split_data_buffer,
+	                 num_elements,
+	                 ray_state,
+	                 rng_state,
+	                 start_sample,
+	                 end_sample,
+	                 sx, sy, sw, sh, offset, stride,
+	                 Queue_index,
+	                 queuesize,
+	                 use_queues_flag,
+	                 work_pool_wgs,
+	                 num_samples,
+	                 buffer);
+}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	extern "C" __global__ void \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+	kernel_cuda_##name() \
+	{ \
+		kernel_##name(NULL); \
+	}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+	extern "C" __global__ void \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+	kernel_cuda_##name() \
+	{ \
+		ccl_local type locals; \
+		kernel_##name(NULL, &locals); \
+	}
+
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index a68f97857b6..078acc1631e 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -16,34 +16,34 @@
 
 /* OpenCL kernel entry points - unfinished */
 
-#include "../../kernel_compat_opencl.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_image_opencl.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_image_opencl.h"
 
-#include "../../kernel_film.h"
+#include "kernel/kernel_film.h"
 
 #if defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__)
-#  include "../../kernel_path.h"
-#  include "../../kernel_path_branched.h"
+#  include "kernel/kernel_path.h"
+#  include "kernel/kernel_path_branched.h"
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
 /* Include only actually used headers for the case
  * when path tracing kernels are not needed.
  */
-#  include "../../kernel_random.h"
-#  include "../../kernel_differential.h"
-#  include "../../kernel_montecarlo.h"
-#  include "../../kernel_projection.h"
-#  include "../../geom/geom.h"
-#  include "../../bvh/bvh.h"
-
-#  include "../../kernel_accumulate.h"
-#  include "../../kernel_camera.h"
-#  include "../../kernel_shader.h"
+#  include "kernel/kernel_random.h"
+#  include "kernel/kernel_differential.h"
+#  include "kernel/kernel_montecarlo.h"
+#  include "kernel/kernel_projection.h"
+#  include "kernel/geom/geom.h"
+#  include "kernel/bvh/bvh.h"
+
+#  include "kernel/kernel_accumulate.h"
+#  include "kernel/kernel_camera.h"
+#  include "kernel/kernel_shader.h"
 #endif  /* defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) */
 
-#include "../../kernel_bake.h"
+#include "kernel/kernel_bake.h"
 
 #ifdef __COMPILE_ONLY_MEGAKERNEL__
 
@@ -54,7 +54,7 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	int sample,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -65,10 +65,10 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
@@ -84,7 +84,7 @@ __kernel void kernel_ocl_shader(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	int type, int sx, int sw, int offset, int sample)
 {
@@ -94,9 +94,9 @@ __kernel void kernel_ocl_shader(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
+	int x = sx + ccl_global_id(0);
 
 	if(x < sx + sw) {
 		kernel_shader_evaluate(kg,
@@ -116,7 +116,7 @@ __kernel void kernel_ocl_bake(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	int type, int filter, int sx, int sw, int offset, int sample)
 {
@@ -126,9 +126,9 @@ __kernel void kernel_ocl_bake(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
+	int x = sx + ccl_global_id(0);
 
 	if(x < sx + sw) {
 #ifdef __NO_BAKING__
@@ -146,7 +146,7 @@ __kernel void kernel_ocl_convert_to_byte(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -157,10 +157,10 @@ __kernel void kernel_ocl_convert_to_byte(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
@@ -173,7 +173,7 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -184,13 +184,29 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
+__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset)
+{
+	size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
+
+	if(i < size / sizeof(float4)) {
+		buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+	}
+	else if(i == size / sizeof(float4)) {
+		ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)];
+
+		for(i = 0; i < size % sizeof(float4); i++) {
+			*(b++) = 0;
+		}
+	}
+}
+
 #endif  /* __COMPILE_ONLY_MEGAKERNEL__ */
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
deleted file mode 100644
index 1914d241eb1..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_background_buffer_update.h"
-
-__kernel void kernel_ocl_path_trace_background_buffer_update(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,             /* Required for buffer Update */
-        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
-        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
-        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
-        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
-        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
-        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
-        int sw, int sh, int sx, int sy, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
-        ccl_global int *Queue_data,            /* Queues memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize,                         /* Size (capacity) of each queue */
-        int end_sample,
-        int start_sample,
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,
-        unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(ray_index == 0) {
-		/* We will empty this queue in this kernel. */
-		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-	}
-	char enqueue_flag = 0;
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          1);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag =
-			kernel_background_buffer_update((KernelGlobals *)kg,
-			                                per_sample_output_buffers,
-			                                rng_state,
-			                                rng_coop,
-			                                throughput_coop,
-			                                PathRadiance_coop,
-			                                Ray_coop,
-			                                PathState_coop,
-			                                L_transparent_coop,
-			                                ray_state,
-			                                sw, sh, sx, sy, stride,
-			                                rng_state_offset_x,
-			                                rng_state_offset_y,
-			                                rng_state_stride,
-			                                work_array,
-			                                end_sample,
-			                                start_sample,
-#ifdef __WORK_STEALING__
-			                                work_pool_wgs,
-			                                num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-			                                debugdata_coop,
-#endif
-			                                parallel_samples,
-			                                ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	 * These rays will be made active during next SceneIntersectkernel.
-	 */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
new file mode 100644
index 00000000000..db65c91baf7
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_buffer_update.h"
+
+__kernel void kernel_ocl_path_trace_buffer_update(
+        ccl_global char *kg,
+        ccl_constant KernelData *data)
+{
+	ccl_local unsigned int local_queue_atomics;
+	kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 18139687eab..8b85d362f8a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -14,77 +14,49 @@
  * limitations under the License.
  */
 
-#include "split/kernel_data_init.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_data_init.h"
 
 __kernel void kernel_ocl_path_trace_data_init(
-        ccl_global char *globals,
-        ccl_global char *sd_DL_shadow,
+        ccl_global char *kg,
         ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
         ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
-        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-        Intersection *Intersection_coop_shadow,
-        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
 
 #define KERNEL_TEX(type, ttype, name)                                   \
         ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global int *Queue_data,                  /* Memory for queues */
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
         ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
         int queuesize,                               /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
         ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
         unsigned int num_samples,                    /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                        /* Number of samples to be processed in parallel */
+        ccl_global float *buffer)
 {
-	kernel_data_init((KernelGlobals *)globals,
-	                 (ShaderData *)sd_DL_shadow,
+	kernel_data_init((KernelGlobals*)kg,
 	                 data,
-	                 per_sample_output_buffers,
-	                 rng_state,
-	                 rng_coop,
-	                 throughput_coop,
-	                 L_transparent_coop,
-	                 PathRadiance_coop,
-	                 Ray_coop,
-	                 PathState_coop,
-	                 Intersection_coop_shadow,
+	                 split_data_buffer,
+	                 num_elements,
 	                 ray_state,
+	                 rng_state,
 
 #define KERNEL_TEX(type, ttype, name) name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	                 start_sample, sx, sy, sw, sh, offset, stride,
-	                 rng_state_offset_x,
-	                 rng_state_offset_y,
-	                 rng_state_stride,
-	                 Queue_data,
+	                 start_sample,
+	                 end_sample,
+	                 sx, sy, sw, sh, offset, stride,
 	                 Queue_index,
 	                 queuesize,
 	                 use_queues_flag,
-	                 work_array,
-#ifdef __WORK_STEALING__
 	                 work_pool_wgs,
 	                 num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-	                 debugdata_coop,
-#endif
-	                 parallel_samples);
+	                 buffer);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index c6a2c8d050c..eb34f750881 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -14,74 +14,14 @@
  * limitations under the License.
  */
 
-#include "split/kernel_direct_lighting.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_direct_lighting.h"
 
 __kernel void kernel_ocl_path_trace_direct_lighting(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                    /* Required for direct lighting */
-        ccl_global uint *rng_coop,              /* Required for direct lighting */
-        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
-        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
-        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
-        ccl_global char *ray_state,             /* Denotes the state of each ray */
-        ccl_global int *Queue_data,             /* Queue memory */
-        ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
-        int queuesize)                          /* Size (capacity) of each queue */
+        ccl_constant KernelData *data)          /* Not referenced directly in this wrapper body */
 {
 	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	char enqueue_flag = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg,
-		                                      (ShaderData *)sd,
-		                                      rng_coop,
-		                                      PathState_coop,
-		                                      ISLamp_coop,
-		                                      LightRay_coop,
-		                                      BSDFEval_coop,
-		                                      ray_state,
-		                                      ray_index);
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-#ifdef __EMISSION__
-	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-#endif
+	kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);  /* Counter is passed uninitialised; callee presumably zeroes it - TODO confirm */
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
new file mode 100644
index 00000000000..83ef5f5f3f2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_do_volume.h"
+
+__kernel void kernel_ocl_path_trace_do_volume(  /* OpenCL entry point: thin wrapper with no logic of its own */
+        ccl_global char *kg,                    /* Opaque buffer; cast to KernelGlobals* below */
+        ccl_constant KernelData *data)          /* Not referenced directly in this wrapper body */
+{
+	kernel_do_volume((KernelGlobals*)kg);  /* Presumably defined in the included kernel/split/kernel_do_volume.h - TODO confirm */
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index e063614da1a..d071b39aa6f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -14,110 +14,16 @@
  * limitations under the License.
  */
 
-#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
 
 __kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                   /* Required throughout the kernel except probabilistic path termination and AO */
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
-        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
-        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
-        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
-        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
-        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
-        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
-        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
-        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
-        int sw, int sh, int sx, int sy, int stride,
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
-        ccl_global int *Queue_data,            /* Queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize,                         /* Size (capacity) of each queue */
-#ifdef __WORK_STEALING__
-        unsigned int start_sample,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
+        ccl_constant KernelData *data)         /* Not referenced directly in this wrapper body */
 {
-	ccl_local unsigned int local_queue_atomics_bg;
-	ccl_local unsigned int local_queue_atomics_ao;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics_bg = 0;
-		local_queue_atomics_ao = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	char enqueue_flag = 0;
-	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif  /* __COMPUTE_DEVICE_GPU__ */
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		kernel_holdout_emission_blurring_pathtermination_ao(
-		        (KernelGlobals *)kg,
-		        (ShaderData *)sd,
-		        per_sample_output_buffers,
-		        rng_coop,
-		        throughput_coop,
-		        L_transparent_coop,
-		        PathRadiance_coop,
-		        PathState_coop,
-		        Intersection_coop,
-		        AOAlpha_coop,
-		        AOBSDF_coop,
-		        AOLightRay_coop,
-		        sw, sh, sx, sy, stride,
-		        ray_state,
-		        work_array,
-#ifdef __WORK_STEALING__
-		        start_sample,
-#endif
-		        parallel_samples,
-		        ray_index,
-		        &enqueue_flag,
-		        &enqueue_flag_AO_SHADOW_RAY_CAST);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_UPDATE_BUFFER rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics_bg,
-	                        Queue_data,
-	                        Queue_index);
-
-#ifdef __AO__
-	/* Enqueue to-shadow-ray-cast rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-	                        enqueue_flag_AO_SHADOW_RAY_CAST,
-	                        queuesize,
-	                        &local_queue_atomics_ao,
-	                        Queue_data,
-	                        Queue_index);
-#endif
+	ccl_local BackgroundAOLocals locals;  /* Type is declared outside this file; contents not visible here */
+	kernel_holdout_emission_blurring_pathtermination_ao(
+	        (KernelGlobals*)kg,  /* Opaque kg buffer reinterpreted as KernelGlobals */
+	        &locals);  /* Passed uninitialised; callee presumably sets it up - TODO confirm */
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
new file mode 100644
index 00000000000..8c213ff5cb2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_indirect_background.h"
+
+__kernel void kernel_ocl_path_trace_indirect_background(  /* OpenCL entry point: thin wrapper with no logic of its own */
+        ccl_global char *kg,                    /* Opaque buffer; cast to KernelGlobals* below */
+        ccl_constant KernelData *data)          /* Not referenced directly in this wrapper body */
+{
+	kernel_indirect_background((KernelGlobals*)kg);  /* Presumably defined in the included kernel/split/kernel_indirect_background.h - TODO confirm */
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
new file mode 100644
index 00000000000..998ebc4c0c3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_indirect_subsurface.h"
+
+__kernel void kernel_ocl_path_trace_indirect_subsurface(  /* OpenCL entry point: thin wrapper with no logic of its own */
+        ccl_global char *kg,                    /* Opaque buffer; cast to KernelGlobals* below */
+        ccl_constant KernelData *data)          /* Not referenced directly in this wrapper body */
+{
+	kernel_indirect_subsurface((KernelGlobals*)kg);  /* Presumably defined in the included kernel/split/kernel_indirect_subsurface.h - TODO confirm */
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index 267bddc2ffc..822d2287715 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -14,67 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_lamp_emission.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_lamp_emission.h"
 
 __kernel void kernel_ocl_path_trace_lamp_emission(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
-        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
-        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
-        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
-        Intersection *Intersection_coop,       /* Required for lamp emission */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global int *Queue_data,            /* Memory for queues */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
-        int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
-                                                * queues to fetch ray index
-                                                */
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
+        ccl_constant KernelData *data)         /* Not referenced directly in this wrapper body */
 {
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	/* We will empty this queue in this kernel. */
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-	}
-	/* Fetch use_queues_flag. */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index,
-		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                          Queue_data,
-		                          queuesize,
-		                          1);
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else {
-		if(x < (sw * parallel_samples) && y < sh) {
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	kernel_lamp_emission((KernelGlobals *)kg,
-	                     throughput_coop,
-	                     PathRadiance_coop,
-	                     Ray_coop,
-	                     PathState_coop,
-	                     Intersection_coop,
-	                     ray_state,
-	                     sw, sh,
-	                     use_queues_flag,
-	                     ray_index);
+	kernel_lamp_emission((KernelGlobals*)kg);  /* Wrapper only casts kg and forwards; ray-index selection presumably happens in the callee - TODO confirm */
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 6d49b6294a8..6d207253a40 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -14,101 +14,14 @@
  * limitations under the License.
  */
 
-#include "split/kernel_next_iteration_setup.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_next_iteration_setup.h"
 
 __kernel void kernel_ocl_path_trace_next_iteration_setup(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                  /* Required for setting up ray for next iteration */
-        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
-        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
-        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
-        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
-        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
-        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
-        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
-        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
-        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
-        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
-        ccl_global char *ray_state,           /* Denotes the state of each ray */
-        ccl_global int *Queue_data,           /* Queue memory */
-        ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
-        int queuesize,                        /* Size (capacity) of each queue */
-        ccl_global char *use_queues_flag)     /* flag to decide if scene_intersect kernel should
-                                               * use queues to fetch ray index */
+        ccl_constant KernelData *data)        /* Not referenced directly in this wrapper body */
 {
 	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		/* If we are here, then it means that scene-intersect kernel
-		* has already been executed atleast once. From the next time,
-		* scene-intersect kernel may operate on queues to fetch ray index
-		*/
-		use_queues_flag[0] = 1;
-
-		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
-		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
-		 * previous kernel.
-		 */
-		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
-	}
-
-	char enqueue_flag = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg,
-		                                           (ShaderData *)sd,
-		                                           rng_coop,
-		                                           throughput_coop,
-		                                           PathRadiance_coop,
-		                                           Ray_coop,
-		                                           PathState_coop,
-		                                           LightRay_dl_coop,
-		                                           ISLamp_coop,
-		                                           BSDFEval_coop,
-		                                           LightRay_ao_coop,
-		                                           AOBSDF_coop,
-		                                           AOAlpha_coop,
-		                                           ray_state,
-		                                           use_queues_flag,
-		                                           ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_UPDATE_BUFFER rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
+	kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics);  /* Counter is passed uninitialised; callee presumably zeroes it - TODO confirm */
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
new file mode 100644
index 00000000000..bd9aa9538c8
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_path_init.h"
+
+__kernel void kernel_ocl_path_trace_path_init(  /* OpenCL entry point: thin wrapper with no logic of its own */
+        ccl_global char *kg,                    /* Opaque buffer; cast to KernelGlobals* below */
+        ccl_constant KernelData *data)          /* Not referenced directly in this wrapper body */
+{
+	kernel_path_init((KernelGlobals*)kg);  /* Presumably defined in the included kernel/split/kernel_path_init.h - TODO confirm */
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 3156dc255fb..9be154e3d75 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -14,93 +14,14 @@
  * limitations under the License.
  */
 
-#include "../../kernel_compat_opencl.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_queues.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_queue_enqueue.h"
 
-/*
- * The kernel "kernel_queue_enqueue" enqueues rays of
- * different ray state into their appropriate Queues;
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel
- * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output of the kernel is as follows,
- *
- * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
- * queuesize -------------------------------------------|                           |
- *
- * Note on Queues :
- * State of queues during the first time this kernel is called :
- * At entry,
- * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
- *
- * State of queue during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
 __kernel void kernel_ocl_path_trace_queue_enqueue(
-        ccl_global int *Queue_data,   /* Queue memory */
-        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
-        ccl_global char *ray_state,   /* Denotes the state of each ray */
-        int queuesize)                /* Size (capacity) of each queue */
+        ccl_global char *kg,              /* Opaque buffer; cast to KernelGlobals* below */
+        ccl_constant KernelData *data)    /* Not referenced directly in this wrapper body */
 {
-	/* We have only 2 cases (Hit/Not-Hit) */
-	ccl_local unsigned int local_queue_atomics[2];
-
-	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
-	if(lidx < 2 ) {
-		local_queue_atomics[lidx] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int queue_number = -1;
-
-	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
-	}
-	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	}
-
-	unsigned int my_lqidx;
-	if(queue_number != -1) {
-		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	if(lidx == 0) {
-		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
-		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                                    local_queue_atomics,
-		                                    Queue_index);
-		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
-		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-		                                    local_queue_atomics,
-		                                    Queue_index);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	unsigned int my_gqidx;
-	if(queue_number != -1) {
-		my_gqidx = get_global_queue_index(queue_number,
-		                                  queuesize,
-		                                  my_lqidx,
-		                                  local_queue_atomics);
-		Queue_data[my_gqidx] = ray_index;
-	}
+	ccl_local QueueEnqueueLocals locals;  /* Type is declared outside this file; contents not visible here */
+	kernel_queue_enqueue((KernelGlobals*)kg, &locals);  /* locals is passed uninitialised; callee presumably sets it up - TODO confirm */
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index 7f3f433c7a6..eb4fb4d153a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -14,67 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_scene_intersect.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_scene_intersect.h"
 
 __kernel void kernel_ocl_path_trace_scene_intersect(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global uint *rng_coop,
-        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
-        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
-        Intersection *Intersection_coop,       /* Required for scene_intersect */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global int *Queue_data,            /* Memory for queues */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
-        int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
-                                                * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
+        ccl_constant KernelData *data)         /* Not referenced directly in this wrapper body */
 {
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	/* Fetch use_queues_flag */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index,
-		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                          Queue_data,
-		                          queuesize,
-		                          0);
-
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else {
-		if(x < (sw * parallel_samples) && y < sh) {
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	kernel_scene_intersect((KernelGlobals *)kg,
-	                       rng_coop,
-	                       Ray_coop,
-	                       PathState_coop,
-	                       Intersection_coop,
-	                       ray_state,
-	                       sw, sh,
-	                       use_queues_flag,
-#ifdef __KERNEL_DEBUG__
-	                       debugdata_coop,
-#endif
-	                       ray_index);
+	kernel_scene_intersect((KernelGlobals*)kg);  /* Wrapper only casts kg and forwards; ray-index selection presumably happens in the callee - TODO confirm */
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index c37856c8f30..6baee460986 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -14,55 +14,14 @@
  * limitations under the License.
  */
 
-#include "split/kernel_shader_eval.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_eval.h"
 
 __kernel void kernel_ocl_path_trace_shader_eval(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                   /* Output ShaderData structure to be filled */
-        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
-        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
-        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
-        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global int *Queue_data,            /* queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize)                         /* Size (capacity) of each queue */
+        ccl_constant KernelData *data)         /* Not referenced directly in this wrapper body */
 {
-	/* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
 	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-
-	char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-
-	/* Continue on with shader evaluation. */
-	kernel_shader_eval((KernelGlobals *)kg,
-	                   (ShaderData *)sd,
-	                   rng_coop,
-	                   Ray_coop,
-	                   PathState_coop,
-	                   Intersection_coop,
-	                   ray_state,
-	                   ray_index);
+	kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics);  /* Counter is passed uninitialised; callee presumably zeroes it - TODO confirm */
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
deleted file mode 100644
index edf76fba714..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_shadow_blocked.h"
-
-__kernel void kernel_ocl_path_trace_shadow_blocked(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
-        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
-        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
-        ccl_global char *ray_state,
-        ccl_global int *Queue_data,            /* Queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize)                         /* Size (capacity) of each queue */
-{
-	int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0);
-
-	ccl_local unsigned int ao_queue_length;
-	ccl_local unsigned int dl_queue_length;
-	if(lidx == 0) {
-		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
-		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	/* flag determining if the current ray is to process shadow ray for AO or DL */
-	char shadow_blocked_type = -1;
-
-	int ray_index = QUEUE_EMPTY_SLOT;
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(thread_index < ao_queue_length + dl_queue_length) {
-		if(thread_index < ao_queue_length) {
-			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
-		} else {
-			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
-		}
-	}
-
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-
-	kernel_shadow_blocked((KernelGlobals *)kg,
-	                      PathState_coop,
-	                      LightRay_dl_coop,
-	                      LightRay_ao_coop,
-	                      ray_state,
-	                      shadow_blocked_type,
-	                      ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
new file mode 100644
index 00000000000..6a8ef81b32a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shadow_blocked_ao.h"
+
+__kernel void kernel_ocl_path_trace_shadow_blocked_ao(  /* OpenCL entry point; forwards to kernel_shadow_blocked_ao() */
+        ccl_global char *kg,                            /* Byte pointer reinterpreted as KernelGlobals* below */
+        ccl_constant KernelData *data)                  /* Not referenced directly in this body */
+{
+	kernel_shadow_blocked_ao((KernelGlobals*)kg);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
new file mode 100644
index 00000000000..b255cc5ef8b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shadow_blocked_dl.h"
+
+__kernel void kernel_ocl_path_trace_shadow_blocked_dl(  /* OpenCL entry point; forwards to kernel_shadow_blocked_dl() */
+        ccl_global char *kg,                            /* Byte pointer reinterpreted as KernelGlobals* below */
+        ccl_constant KernelData *data)                  /* Not referenced directly in this body */
+{
+	kernel_shadow_blocked_dl((KernelGlobals*)kg);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
new file mode 100644
index 00000000000..732cda30115
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
+#include "kernel/kernels/opencl/kernel_data_init.cl"
+#include "kernel/kernels/opencl/kernel_path_init.cl"
+
+#include "kernel/kernels/opencl/kernel_scene_intersect.cl"
+#include "kernel/kernels/opencl/kernel_lamp_emission.cl"
+#include "kernel/kernels/opencl/kernel_do_volume.cl"
+#include "kernel/kernels/opencl/kernel_indirect_background.cl"
+#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
+#include "kernel/kernels/opencl/kernel_shader_eval.cl"
+#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl"
+#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl"
+#include "kernel/kernels/opencl/kernel_direct_lighting.cl"
+#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl"
+#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl"
+#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
+#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
+#include "kernel/kernels/opencl/kernel_buffer_update.cl"
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
new file mode 100644
index 00000000000..c10ecc426c6
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+
+__kernel void kernel_ocl_path_trace_state_buffer_size(  /* Writes the result of split_data_buffer_size() for num_threads to *size */
+        ccl_global char *kg,                            /* Byte pointer reinterpreted as KernelGlobals* below */
+        ccl_constant KernelData *data,                  /* Stored into kg->data before the size query */
+        uint num_threads,                               /* Forwarded unchanged to split_data_buffer_size() */
+        ccl_global uint64_t *size)                      /* Output: receives the value returned by split_data_buffer_size() */
+{
+	((KernelGlobals*)kg)->data = data;  /* NOTE(review): presumably split_data_buffer_size() reads kernel data through kg - confirm */
+	*size = split_data_buffer_size((KernelGlobals*)kg, num_threads);  /* NOTE(review): every work-item executes this store; presumably launched with a single one - confirm */
+}
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
new file mode 100644
index 00000000000..7a1838e485f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_subsurface_scatter.h"
+
+__kernel void kernel_ocl_path_trace_subsurface_scatter(  /* OpenCL entry point; forwards to kernel_subsurface_scatter() */
+        ccl_global char *kg,                             /* Byte pointer reinterpreted as KernelGlobals* below */
+        ccl_constant KernelData *data)                   /* Not referenced directly in this body */
+{
+	ccl_local unsigned int local_queue_atomics;  /* Not initialized here; presumably reset inside kernel_subsurface_scatter() - TODO confirm */
+	kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics);  /* Callee receives the variable by address */
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
deleted file mode 100644
index 88a1ed830af..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_sum_all_radiance.h"
-
-__kernel void kernel_ocl_path_trace_sum_all_radiance(
-        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
-        ccl_global float *buffer,                    /* Output buffer of RenderTile */
-        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
-        int parallel_samples, int sw, int sh, int stride,
-        int buffer_offset_x,
-        int buffer_offset_y,
-        int buffer_stride,
-        int start_sample)
-{
-	kernel_sum_all_radiance(data,
-	                        buffer,
-	                        per_sample_output_buffer,
-	                        parallel_samples,
-	                        sw, sh, stride,
-	                        buffer_offset_x,
-	                        buffer_offset_y,
-	                        buffer_stride,
-	                        start_sample);
-}
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index 98de40e5a8a..d2eb89e0e0a 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	..
-	../svm
-	../../graph
-	../../render
-	../../util
-	../../device
+	../..  # Single include root two levels up; sources include headers by full path ("kernel/...", "util/...", "render/...")
 )
 
 set(INC_SYS
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index d835f9be45c..2e73e7a601e 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -34,10 +34,10 @@
 
 #include <OSL/genclosure.h>
 
-#include "osl_closures.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_compat_cpu.h"
-#include "closure/alloc.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/closure/alloc.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index bc26f42b559..ea18f2c8c86 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,13 +34,13 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "kernel_montecarlo.h"
-#include "closure/alloc.h"
-#include "closure/bsdf_diffuse_ramp.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_diffuse_ramp.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index 14c7644936e..a26671eb09e 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,12 +34,12 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "closure/alloc.h"
-#include "closure/bsdf_phong_ramp.h"
+#include "kernel/kernel_types.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_phong_ramp.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index 3f13e08b302..8843a196dad 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -34,12 +34,12 @@
 
 #include <OSL/genclosure.h>
 
-#include "osl_closures.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_compat_cpu.h"
-#include "kernel_types.h"
-#include "closure/alloc.h"
-#include "closure/emissive.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/emissive.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 44daefee249..188c3960a5f 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -32,17 +32,17 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "kernel_montecarlo.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_montecarlo.h"
 
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf_diffuse.h"
-#include "closure/bsdf_principled_diffuse.h"
-#include "closure/bssrdf.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bssrdf.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -81,7 +81,7 @@ public:
 				bssrdf->sharpness = sharpness;
 				bssrdf->N = params.N;
 				bssrdf->roughness = params.roughness;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+				sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 			}
 
 			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
@@ -93,7 +93,7 @@ public:
 				bssrdf->sharpness = sharpness;
 				bssrdf->N = params.N;
 				bssrdf->roughness = params.roughness;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+				sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 			}
 
 			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
@@ -105,7 +105,7 @@ public:
 				bssrdf->sharpness = sharpness;
 				bssrdf->N = params.N;
 				bssrdf->roughness = params.roughness;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+				sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 			}
 		}
 	}
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 5570a22692e..5b66793a05d 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -33,35 +33,36 @@
 #include <OSL/genclosure.h>
 #include <OSL/oslclosure.h>
 
-#include "osl_closures.h"
-#include "osl_shader.h"
-
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_param.h"
-
-#include "kernel_types.h"
-#include "kernel_compat_cpu.h"
-#include "kernel_globals.h"
-#include "kernel_montecarlo.h"
-#include "kernel_random.h"
-
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf_ashikhmin_velvet.h"
-#include "closure/bsdf_diffuse.h"
-#include "closure/bsdf_microfacet.h"
-#include "closure/bsdf_microfacet_multi.h"
-#include "closure/bsdf_oren_nayar.h"
-#include "closure/bsdf_reflection.h"
-#include "closure/bsdf_refraction.h"
-#include "closure/bsdf_transparent.h"
-#include "closure/bsdf_ashikhmin_shirley.h"
-#include "closure/bsdf_toon.h"
-#include "closure/bsdf_hair.h"
-#include "closure/bsdf_principled_diffuse.h"
-#include "closure/bsdf_principled_sheen.h"
-#include "closure/volume.h"
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_shader.h"
+
+#include "util/util_debug.h"
+#include "util/util_math.h"
+#include "util/util_param.h"
+
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_random.h"
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf_ashikhmin_velvet.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_microfacet.h"
+#include "kernel/closure/bsdf_microfacet_multi.h"
+#include "kernel/closure/bsdf_oren_nayar.h"
+#include "kernel/closure/bsdf_reflection.h"
+#include "kernel/closure/bsdf_refraction.h"
+#include "kernel/closure/bsdf_transparent.h"
+#include "kernel/closure/bsdf_ashikhmin_shirley.h"
+#include "kernel/closure/bsdf_toon.h"
+#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
+#include "kernel/closure/volume.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index c9740f81c8a..ff5fd9cc905 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -33,8 +33,8 @@
 #ifndef __OSL_CLOSURES_H__
 #define __OSL_CLOSURES_H__
 
-#include "util_types.h"
-#include "kernel_types.h"
+#include "util/util_types.h"
+#include "kernel/kernel_types.h"
 
 #include <OSL/oslclosure.h>
 #include <OSL/oslexec.h>
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 65cb7ecc6b4..02c083a83f8 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -21,10 +21,10 @@
 
 #include <OSL/oslexec.h>
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 #ifndef WIN32
 using std::isfinite;
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 58bbdc33920..b767c60c617 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -25,33 +25,34 @@
 
 #include <string.h>
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-
-#include "osl_closures.h"
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_string.h"
-
-#include "kernel_compat_cpu.h"
-#include "kernel_globals.h"
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_differential.h"
-#include "kernel_montecarlo.h"
-#include "kernel_camera.h"
-#include "kernels/cpu/kernel_cpu_image.h"
-#include "geom/geom.h"
-#include "bvh/bvh.h"
-
-#include "kernel_projection.h"
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_string.h"
+
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernels/cpu/kernel_cpu_image.h"
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
+
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
 
 #ifdef WITH_PTEX
 #  include <Ptexture.h>
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 0d762bbdb38..13b19d86eca 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -16,21 +16,22 @@
 
 #include <OSL/oslexec.h>
 
-#include "kernel_compat_cpu.h"
-#include "kernel_montecarlo.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
 
-#include "geom/geom_object.h"
+#include "kernel/geom/geom_object.h"
 
-#include "osl_closures.h"
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
 
-#include "util_foreach.h"
+#include "util/util_foreach.h"
 
-#include "attribute.h"
+#include "render/attribute.h"
 
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index ad06dd6929d..32121e940b4 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -29,7 +29,7 @@
  * This means no thread state must be passed along in the kernel itself.
  */
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
deleted file mode 100644
index 9bfa71c75ef..00000000000
--- a/intern/cycles/kernel/split/kernel_background_buffer_update.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_background_buffer_update kernel.
- * This is the fourth kernel in the ray tracing logic, and the third
- * of the path iteration kernels. This kernel takes care of rays that hit
- * the background (sceneintersect kernel), and for the rays of
- * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
- * the output buffer. This kernel also takes care of rays that have been determined
- * to-be-regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
- * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output are as follows,
- *
- * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
- * throughput_coop --------------------------------------|                                      |--- L_transparent_coop
- * per_sample_output_buffers ----------------------------|                                      |--- per_sample_output_buffers
- * Ray_coop ---------------------------------------------|                                      |--- ray_state
- * PathState_coop ---------------------------------------|                                      |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * L_transparent_coop -----------------------------------|                                      |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * ray_state --------------------------------------------|                                      |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                      |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                      |--- work_array
- * parallel_samples -------------------------------------|                                      |--- PathState_coop
- * end_sample -------------------------------------------|                                      |--- throughput_coop
- * kg (globals) -----------------------------------------|                                      |--- rng_coop
- * rng_state --------------------------------------------|                                      |--- Ray
- * PathRadiance_coop ------------------------------------|                                      |
- * sw ---------------------------------------------------|                                      |
- * sh ---------------------------------------------------|                                      |
- * sx ---------------------------------------------------|                                      |
- * sy ---------------------------------------------------|                                      |
- * stride -----------------------------------------------|                                      |
- * work_array -------------------------------------------|                                      |--- work_array
- * queuesize --------------------------------------------|                                      |
- * start_sample -----------------------------------------|                                      |--- work_pool_wgs
- * work_pool_wgs ----------------------------------------|                                      |
- * num_samples ------------------------------------------|                                      |
- *
- * note on sd : sd argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
- * Note on Queues :
- * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * State of queues when this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
- */
-ccl_device char kernel_background_buffer_update(
-        KernelGlobals *kg,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,             /* Required for buffer Update */
-        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
-        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
-        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
-        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
-        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
-        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
-        int sw, int sh, int sx, int sy, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
-        int end_sample,
-        int start_sample,
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,
-        unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples,                  /* Number of samples to be processed in parallel */
-        int ray_index)
-{
-	char enqueue_flag = 0;
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
-	ccl_global PathState *state = &PathState_coop[ray_index];
-	PathRadiance *L = L = &PathRadiance_coop[ray_index];
-	ccl_global Ray *ray = &Ray_coop[ray_index];
-	ccl_global float3 *throughput = &throughput_coop[ray_index];
-	ccl_global float *L_transparent = &L_transparent_coop[ray_index];
-	ccl_global uint *rng = &rng_coop[ray_index];
-
-#ifdef __WORK_STEALING__
-	unsigned int my_work;
-	ccl_global float *initial_per_sample_output_buffers;
-	ccl_global uint *initial_rng;
-#endif
-	unsigned int sample;
-	unsigned int tile_x;
-	unsigned int tile_y;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-	unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-	my_work = work_array[ray_index];
-	sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-	get_pixel_tile_position(&pixel_x, &pixel_y,
-	                        &tile_x, &tile_y,
-	                        my_work,
-	                        sw, sh, sx, sy,
-	                        parallel_samples,
-	                        ray_index);
-	my_sample_tile = 0;
-	initial_per_sample_output_buffers = per_sample_output_buffers;
-	initial_rng = rng_state;
-#else  /* __WORK_STEALING__ */
-	sample = work_array[ray_index];
-	int tile_index = ray_index / parallel_samples;
-	/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
-	tile_x = tile_index % sw;
-	tile_y = tile_index / sw;
-	my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif  /* __WORK_STEALING__ */
-
-	rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-	per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-
-	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		/* eval background shader if nothing hit */
-		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
-			*L_transparent = (*L_transparent) + average((*throughput));
-#ifdef __PASSES__
-			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, kg->sd_input, state, ray);
-			path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
-#endif
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-		}
-	}
-
-	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-		float3 L_sum = path_radiance_clamp_and_sum(kg, L);
-		kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
-#ifdef __KERNEL_DEBUG__
-		kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
-#endif
-		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
-
-		/* accumulate result in output buffer */
-		kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-		path_rng_end(kg, rng_state, *rng);
-
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-	}
-
-	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-		/* We have completed current work; So get next work */
-		int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-		if(!valid_work) {
-			/* If work is invalid, this means no more work is available and the thread may exit */
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-		}
-#else  /* __WORK_STEALING__ */
-		if((sample + parallel_samples) >= end_sample) {
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-		}
-#endif  /* __WORK_STEALING__ */
-
-		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-			work_array[ray_index] = my_work;
-			/* Get the sample associated with the current work */
-			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-			/* Get pixel and tile position associated with current work */
-			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
-			my_sample_tile = 0;
-
-			/* Remap rng_state according to the current work */
-			rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
-			/* Remap per_sample_output_buffers according to the current work */
-			per_sample_output_buffers = initial_per_sample_output_buffers
-				+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-#else  /* __WORK_STEALING__ */
-			work_array[ray_index] = sample + parallel_samples;
-			sample = work_array[ray_index];
-
-			/* Get ray position from ray index */
-			pixel_x = sx + ((ray_index / parallel_samples) % sw);
-			pixel_y = sy + ((ray_index / parallel_samples) / sw);
-#endif  /* __WORK_STEALING__ */
-
-			/* Initialize random numbers and ray. */
-			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
-
-			if(ray->t != 0.0f) {
-				/* Initialize throughput, L_transparent, Ray, PathState;
-				 * These rays proceed with path-iteration.
-				 */
-				*throughput = make_float3(1.0f, 1.0f, 1.0f);
-				*L_transparent = 0.0f;
-				path_radiance_init(L, kernel_data.film.use_light_pass);
-				path_state_init(kg, kg->sd_input, state, rng, sample, ray);
-#ifdef __KERNEL_DEBUG__
-				debug_data_init(debug_data);
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
-				enqueue_flag = 1;
-			}
-			else {
-				/* These rays do not participate in path-iteration. */
-				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				/* Accumulate result in output buffer. */
-				kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-				path_rng_end(kg, rng_state, *rng);
-
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-			}
-		}
-	}
-	return enqueue_flag;
-}
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
new file mode 100644
index 00000000000..859c221d976
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel takes care of rays that hit the background (sceneintersect
+ * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's
+ * accumulated radiance in the output buffer. This kernel also takes care of
+ * rays that have been determined to-be-regenerated.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
+ *
+ * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel.
+ * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put
+ * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * State of queues when this kernel is called:
+ * At entry,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
+ * At exit,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ */
+ccl_device void kernel_buffer_update(KernelGlobals *kg,
+                                     ccl_local_param unsigned int *local_queue_atomics)
+{
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel. */
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+	ccl_global uint *rng_state = kernel_split_params.rng_state;
+	int stride = kernel_split_params.stride;
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+#ifdef __KERNEL_DEBUG__
+	DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
+#endif
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
+	RNG rng = kernel_split_state.rng[ray_index];
+	ccl_global float *buffer = kernel_split_params.buffer;
+
+	unsigned int work_index;
+	ccl_global uint *initial_rng;
+
+	unsigned int sample;
+	unsigned int tile_x;
+	unsigned int tile_y;
+	unsigned int pixel_x;
+	unsigned int pixel_y;
+
+	work_index = kernel_split_state.work_array[ray_index];
+	sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
+	                        &tile_x, &tile_y,
+	                        work_index,
+	                        ray_index);
+	initial_rng = rng_state;
+	/* Point rng_state and buffer at the pixel of the current work. */
+	rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
+	buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
+		}
+		else
+#endif  /* __SHADOW_TRICKS__ */
+		{
+			L_sum = path_radiance_clamp_and_sum(kg, L);
+		}
+		kernel_write_light_passes(kg, buffer, L, sample);
+#ifdef __KERNEL_DEBUG__
+		kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
+#endif
+		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
+
+		/* accumulate result in output buffer */
+		kernel_write_pass_float4(buffer, sample, L_rad);
+		path_rng_end(kg, rng_state, rng);
+
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+		/* We have completed current work; So get next work */
+		int valid_work = get_next_work(kg, &work_index, ray_index);
+		if(!valid_work) {
+			/* If work is invalid, this means no more work is available and the thread may exit */
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+			kernel_split_state.work_array[ray_index] = work_index;
+			/* Get the sample associated with the current work */
+			sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+			/* Get pixel and tile position associated with current work */
+			get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
+
+			/* Remap rng_state according to the current work */
+			rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride;
+			/* Remap buffer from its base pointer; it already carries the previous work's offset */
+			buffer = kernel_split_params.buffer + (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+
+			/* Initialize random numbers and ray. */
+			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng, ray);
+
+			if(ray->t != 0.0f) {
+				/* Initialize throughput, L_transparent, Ray, PathState;
+				 * These rays proceed with path-iteration.
+				 */
+				*throughput = make_float3(1.0f, 1.0f, 1.0f);
+				*L_transparent = 0.0f;
+				path_radiance_init(L, kernel_data.film.use_light_pass);
+				path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &rng, sample, ray);
+#ifdef __SUBSURFACE__
+				kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+#ifdef __KERNEL_DEBUG__
+				debug_data_init(debug_data);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				enqueue_flag = 1;
+			}
+			else {
+				/* These rays do not participate in path-iteration. */
+				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				/* Accumulate result in output buffer. */
+				kernel_write_pass_float4(buffer, sample, L_rad);
+				path_rng_end(kg, rng_state, rng);
+
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+			}
+		}
+	}
+	kernel_split_state.rng[ray_index] = rng;
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * These rays will be made active during the next scene-intersect kernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 6e158d53d23..9d3d01fff75 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -14,108 +14,105 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_data_initialization kernel
- * This kernel Initializes structures needed in path-iteration kernels.
- * This is the first kernel in ray-tracing logic.
+/* This kernel initializes structures needed in path-iteration kernels.
  *
- * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
- *
- * Its input and output are as follows,
- *
- * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
- * Un-initialized throughput -------|                                  |--- Initialized throughput
- * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
- * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
- * Un-initialized Ray --------------|                                  |--- Initialized Ray
- * Un-initialized PathState --------|                                  |--- Initialized PathState
- * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
- * Un-initialized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
- * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
- * Un-initialized ray_state --------|                                  |--- Initialized ray_state
- * parallel_samples --------------- |                                  |--- Initialized per_sample_output_buffers
- * rng_state -----------------------|                                  |--- Initialized work_array
- * data ----------------------------|                                  |--- Initialized work_pool_wgs
- * start_sample --------------------|                                  |
- * sx ------------------------------|                                  |
- * sy ------------------------------|                                  |
- * sw ------------------------------|                                  |
- * sh ------------------------------|                                  |
- * stride --------------------------|                                  |
- * queuesize -----------------------|                                  |
- * num_samples ---------------------|                                  |
- *
- * Note on Queues :
+ * Note on Queues:
  * All slots in queues are initialized to queue empty slot;
  * The number of elements in the queues is initialized to 0;
  */
+
+/* Distributes work_size iterations across all threads: each thread takes a
+ * contiguous chunk, and threads below the remainder take one extra item.
+ * note: work done inside the loop may not show up to all threads till after
+ * the current kernel has completed. The kg argument is currently unused. */
+#define parallel_for(kg, iter_name, work_size) \
+	for(size_t _size = (work_size), \
+	    _global_size = ccl_global_size(0) * ccl_global_size(1), \
+	    _n = _size / _global_size, \
+		_thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \
+	    iter_name = (_n > 0) ? (_thread * _n) : (_thread) \
+		; \
+		(iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \
+		; \
+		iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \
+	)
+
+#ifndef __KERNEL_CPU__
 ccl_device void kernel_data_init(
+#else
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+#endif
         KernelGlobals *kg,
-        ShaderData *sd_DL_shadow,
         ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
         ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
-        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-        Intersection *Intersection_coop_shadow,
-        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
 
+#ifdef __KERNEL_OPENCL__
 #define KERNEL_TEX(type, ttype, name)                                   \
         ccl_global type *name,
-#include "../kernel_textures.h"
+#include "kernel/kernel_textures.h"
+#endif
 
-        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global int *Queue_data,                  /* Memory for queues */
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
         ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
         int queuesize,                               /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
-        unsigned int num_samples,                    /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                        /* Number of samples to be processed in parallel */
+        ccl_global unsigned int *work_pools,      /* Work pool for each work group */
+        unsigned int num_samples,
+        ccl_global float *buffer)
 {
+#ifdef __KERNEL_OPENCL__
 	kg->data = data;
-	kg->sd_input = sd_DL_shadow;
-	kg->isect_shadow = Intersection_coop_shadow;
+#endif
+
+	kernel_split_params.x = sx;
+	kernel_split_params.y = sy;
+	kernel_split_params.w = sw;
+	kernel_split_params.h = sh;
+
+	kernel_split_params.offset = offset;
+	kernel_split_params.stride = stride;
+
+	kernel_split_params.rng_state = rng_state;
+
+	kernel_split_params.start_sample = start_sample;
+	kernel_split_params.end_sample = end_sample;
+
+	kernel_split_params.work_pools = work_pools;
+	kernel_split_params.num_samples = num_samples;
+
+	kernel_split_params.queue_index = Queue_index;
+	kernel_split_params.queue_size = queuesize;
+	kernel_split_params.use_queues_flag = use_queues_flag;
+
+	kernel_split_params.buffer = buffer;
+
+	split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
+
+#ifdef __KERNEL_OPENCL__
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../kernel_textures.h"
-
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+#include "kernel/kernel_textures.h"
+#endif
 
-#ifdef __WORK_STEALING__
-	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	/* Initialize work_pool_wgs */
-	if(lid == 0) {
-		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
-		work_pool_wgs[group_index] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-#endif  /* __WORK_STEALING__ */
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
 
 	/* Initialize queue data and queue index. */
 	if(thread_index < queuesize) {
 		/* Initialize active ray queue. */
-		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 		/* Initialize background and buffer update queue. */
-		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 		/* Initialize shadow ray cast of AO queue. */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 		/* Initialize shadow ray cast of direct lighting queue. */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 	}
 
 	if(thread_index == 0) {
@@ -126,109 +123,31 @@ ccl_device void kernel_data_init(
 		/* The scene-intersect kernel should not use the queues very first time.
 		 * since the queue would be empty.
 		 */
-		use_queues_flag[0] = 0;
+		*use_queues_flag = 0;
 	}
 
-	int x = get_global_id(0);
-	int y = get_global_id(1);
+	/* Zero the tile's pixels and initialize rng_state if this is the first sample. */
+	if(start_sample == 0) {
+		parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) {
+			int pixel = i / kernel_data.film.pass_stride;
+			int pass = i % kernel_data.film.pass_stride;
 
-	if(x < (sw * parallel_samples) && y < sh) {
-		int ray_index = x + y * (sw * parallel_samples);
+			int x = sx + pixel % sw;
+			int y = sy + pixel / sw;
 
-		/* This is the first assignment to ray_state;
-		 * So we dont use ASSIGN_RAY_STATE macro.
-		 */
-		ray_state[ray_index] = RAY_ACTIVE;
-
-		unsigned int my_sample;
-		unsigned int pixel_x;
-		unsigned int pixel_y;
-		unsigned int tile_x;
-		unsigned int tile_y;
-		unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-		unsigned int my_work = 0;
-		/* Get work. */
-		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-		/* Get the sample associated with the work. */
-		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-
-		my_sample_tile = 0;
-
-		/* Get pixel and tile position associated with the work. */
-		get_pixel_tile_position(&pixel_x, &pixel_y,
-		                        &tile_x, &tile_y,
-		                        my_work,
-		                        sw, sh, sx, sy,
-		                        parallel_samples,
-		                        ray_index);
-		work_array[ray_index] = my_work;
-#else  /* __WORK_STEALING__ */
-		unsigned int tile_index = ray_index / parallel_samples;
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-		my_sample = my_sample_tile + start_sample;
-
-		/* Initialize work array. */
-		work_array[ray_index] = my_sample ;
-
-		/* Calculate pixel position of this ray. */
-		pixel_x = sx + tile_x;
-		pixel_y = sy + tile_y;
-#endif  /* __WORK_STEALING__ */
-
-		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-
-		/* Initialise per_sample_output_buffers to all zeros. */
-		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
-		int per_sample_output_buffers_iterator = 0;
-		for(per_sample_output_buffers_iterator = 0;
-		    per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
-		    per_sample_output_buffers_iterator++)
-		{
-			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
-		}
+			int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass;
 
-		/* Initialize random numbers and ray. */
-		kernel_path_trace_setup(kg,
-		                        rng_state,
-		                        my_sample,
-		                        pixel_x, pixel_y,
-		                        &rng_coop[ray_index],
-		                        &Ray_coop[ray_index]);
-
-		if(Ray_coop[ray_index].t != 0.0f) {
-			/* Initialize throughput, L_transparent, Ray, PathState;
-			 * These rays proceed with path-iteration.
-			 */
-			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
-			L_transparent_coop[ray_index] = 0.0f;
-			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
-			path_state_init(kg,
-			                kg->sd_input,
-			                &PathState_coop[ray_index],
-			                &rng_coop[ray_index],
-			                my_sample,
-			                &Ray_coop[ray_index]);
-#ifdef __KERNEL_DEBUG__
-			debug_data_init(&debugdata_coop[ray_index]);
-#endif
+			*(buffer + index) = 0.0f;
 		}
-		else {
-			/* These rays do not participate in path-iteration. */
-			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			/* Accumulate result in output buffer. */
-			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
-			path_rng_end(kg, rng_state, rng_coop[ray_index]);
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-		}
-	}
 
-	/* Mark rest of the ray-state indices as RAY_INACTIVE. */
-	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
-		/* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
-		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+		parallel_for(kg, i, sw * sh) {
+			int x = sx + i % sw;
+			int y = sy + i / sw;
+
+			int index = (offset + x + y*stride);
+			*(rng_state + index) = hash_int_2d(x, y);
+		}
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 82ca18829d3..bdbf7387b95 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -14,95 +14,144 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_direct_lighting kernel.
- * This is the eighth kernel in the ray tracing logic. This is the seventh
- * of the path iteration kernels. This kernel takes care of direct lighting
- * logic. However, the "shadow ray cast" part of direct lighting is handled
+/* This kernel takes care of direct lighting logic.
+ * However, the "shadow ray cast" part of direct lighting is handled
  * in the next kernel.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
- * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with direct lighting should be executed. Those rays for which
+ * a shadow_blocked() function for direct-lighting must be executed, are
+ * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
+ * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute
+ * the corresponding shadow_blocked part, after direct lighting, the ray is
+ * marked with RAY_SHADOW_RAY_CAST_DL flag.
  *
- * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
- * PathState_coop -----------------------------------|                             |--- ISLamp_coop
- * sd -----------------------------------------------|                             |--- LightRay_coop
- * ray_state ----------------------------------------|                             |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                             |
- * kg (globals) -------------------------------------|                             |
- * queuesize ----------------------------------------|                             |
- *
- * Note on Queues :
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
- * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
- * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ * State of queues when this kernel is called:
+ * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and
+ *   QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this
+ *   kernel call.
+ * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a
+ *   shadow_blocked function must be executed, after this kernel call.
+ *   Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
  */
-ccl_device char kernel_direct_lighting(
-        KernelGlobals *kg,
-        ShaderData *sd,                         /* Required for direct lighting */
-        ccl_global uint *rng_coop,              /* Required for direct lighting */
-        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
-        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
-        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
-        ccl_global char *ray_state,             /* Denotes the state of each ray */
-        int ray_index)
+ccl_device void kernel_direct_lighting(KernelGlobals *kg,
+                                       ccl_local_param unsigned int *local_queue_atomics)
 {
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
 	char enqueue_flag = 0;
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		ccl_global PathState *state = &PathState_coop[ray_index];
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+	/* Non-GPU devices instead guard the body, so every thread reaches the enqueue below. */
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		ShaderData *sd = &kernel_split_state.sd[ray_index];
 
 		/* direct lighting */
 #ifdef __EMISSION__
-		if((kernel_data.integrator.use_direct_light &&
-		    (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
-		{
+		RNG rng = kernel_split_state.rng[ray_index];
+		bool flag = (kernel_data.integrator.use_direct_light &&
+		             (sd->flag & SD_BSDF_HAS_EVAL));
+#  ifdef __SHADOW_TRICKS__
+		if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
+			flag = false;
+			ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+			float3 throughput = kernel_split_state.throughput[ray_index];
+			PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+			kernel_branched_path_surface_connect_light(kg,
+			                                           &rng,
+			                                           sd,
+			                                           emission_sd,
+			                                           state,
+			                                           throughput,
+			                                           1.0f,
+			                                           L,
+			                                           1);
+		}
+#  endif  /* __SHADOW_TRICKS__ */
+		if(flag) {
 			/* Sample illumination from lights to find path contribution. */
-			ccl_global RNG* rng = &rng_coop[ray_index];
-			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+			float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
 			float light_u, light_v;
-			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-			float terminate = path_state_rng_light_termination(kg, rng, state);
+			path_state_rng_2D(kg, &rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+			float terminate = path_state_rng_light_termination(kg, &rng, state);
 
 			LightSample ls;
 			if(light_sample(kg,
 			                light_t, light_u, light_v,
-			                ccl_fetch(sd, time),
-			                ccl_fetch(sd, P),
+			                sd->time,
+			                sd->P,
 			                state->bounce,
 			                &ls)) {
 
 				Ray light_ray;
-#ifdef __OBJECT_MOTION__
-				light_ray.time = ccl_fetch(sd, time);
-#endif
+#  ifdef __OBJECT_MOTION__
+				light_ray.time = sd->time;
+#  endif
 
 				BsdfEval L_light;
 				bool is_lamp;
-				if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+				if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 					/* Write intermediate data to global memory to access from
 					 * the next kernel.
 					 */
-					LightRay_coop[ray_index] = light_ray;
-					BSDFEval_coop[ray_index] = L_light;
-					ISLamp_coop[ray_index] = is_lamp;
+					kernel_split_state.light_ray[ray_index] = light_ray;
+					kernel_split_state.bsdf_eval[ray_index] = L_light;
+					kernel_split_state.is_lamp[ray_index] = is_lamp;
 					/* Mark ray state for next shadow kernel. */
-					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+					ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
 					enqueue_flag = 1;
 				}
 			}
 		}
+		kernel_split_state.rng[ray_index] = rng;
 #endif  /* __EMISSION__ */
 	}
-	return enqueue_flag;
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+#ifdef __EMISSION__
+	/* Enqueue rays flagged RAY_SHADOW_RAY_CAST_DL into QUEUE_SHADOW_RAY_CAST_DL_RAYS. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
new file mode 100644
index 00000000000..47d3c280831
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+
+ccl_device void kernel_do_volume(KernelGlobals *kg)
+{
+#ifdef __VOLUME__
+	/* We will empty this queue in this kernel. */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+	/* Fetch use_queues_flag. */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(local_use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          1);
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+
+		bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
+
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		RNG rng = kernel_split_state.rng[ray_index];
+		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+		ShaderData *sd = &kernel_split_state.sd[ray_index];
+		ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index];
+
+		/* Sanitize volume stack. */
+		if(!hit) {
+			kernel_volume_clean_stack(kg, state->volume_stack);
+		}
+		/* volume attenuation, emission, scatter */
+		if(state->volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = *ray;
+			volume_ray.t = (hit)? isect->t: FLT_MAX;
+
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+			{
+				/* integrate along volume segment with distance sampling */
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L);
+
+					/* indirect light bounce */
+					if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray))
+						ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED);
+					else
+						ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER);
+				}
+#  endif
+			}
+		}
+		kernel_split_state.rng[ray_index] = rng;
+	}
+
+#endif
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 5d951b972ed..9fc853a84bf 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -14,157 +14,159 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
- * This is the sixth kernel in the ray tracing logic. This is the fifth
- * of the path iteration kernels. This kernel takes care of the logic to process
- * "material of type holdout", indirect primitive emission, bsdf blurring,
- * probabilistic path termination and AO.
+/* This kernel takes care of the logic to process "material of type holdout",
+ * indirect primitive emission, bsdf blurring, probabilistic path termination
+ * and AO.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed.
- * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with AO should be executed. Those rays for which a
+ * shadow_blocked() function for AO must be executed are marked with flag
+ * RAY_SHADOW_RAY_CAST_AO and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS.
  *
  * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                                           |--- PathState_coop
- * PathRadiance_coop ------------------------------------|                                                           |--- throughput_coop
- * Intersection_coop ------------------------------------|                                                           |--- L_transparent_coop
- * PathState_coop ---------------------------------------|                                                           |--- per_sample_output_buffers
- * L_transparent_coop -----------------------------------|                                                           |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                                           |--- ShaderData
- * ray_state --------------------------------------------|                                                           |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                           |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                           |--- AOAlpha_coop
- * kg (globals) -----------------------------------------|                                                           |--- AOBSDF_coop
- * parallel_samples -------------------------------------|                                                           |--- AOLightRay_coop
- * per_sample_output_buffers ----------------------------|                                                           |
- * sw ---------------------------------------------------|                                                           |
- * sh ---------------------------------------------------|                                                           |
- * sx ---------------------------------------------------|                                                           |
- * sy ---------------------------------------------------|                                                           |
- * stride -----------------------------------------------|                                                           |
- * work_array -------------------------------------------|                                                           |
- * queuesize --------------------------------------------|                                                           |
- * start_sample -----------------------------------------|                                                           |
- *
- * Note on Queues :
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
+ *     flag RAY_SHADOW_RAY_CAST_AO
  */
 ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
         KernelGlobals *kg,
-        ShaderData *sd,                        /* Required throughout the kernel except probabilistic path termination and AO */
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
-        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
-        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
-        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
-        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
-        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
-        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
-        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
-        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
-        int sw, int sh, int sx, int sy, int stride,
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
-#ifdef __WORK_STEALING__
-        unsigned int start_sample,
-#endif
-        int parallel_samples,                  /* Number of samples to be processed in parallel */
-        int ray_index,
-        char *enqueue_flag,
-        char *enqueue_flag_AO_SHADOW_RAY_CAST)
+        ccl_local_param BackgroundAOLocals *locals)
 {
-#ifdef __WORK_STEALING__
-	unsigned int my_work;
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		locals->queue_atomics_bg = 0;
+		locals->queue_atomics_ao = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+	int stride = kernel_split_params.stride;
+
+	unsigned int work_index;
 	unsigned int pixel_x;
 	unsigned int pixel_y;
-#endif
+
 	unsigned int tile_x;
 	unsigned int tile_y;
-	int my_sample_tile;
 	unsigned int sample;
 
-	ccl_global RNG *rng = 0x0;
+	RNG rng = kernel_split_state.rng[ray_index];
 	ccl_global PathState *state = 0x0;
 	float3 throughput;
 
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ShaderData *sd = &kernel_split_state.sd[ray_index];
+	ccl_global float *buffer = kernel_split_params.buffer;
+
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 
-		throughput = throughput_coop[ray_index];
-		state = &PathState_coop[ray_index];
-		rng = &rng_coop[ray_index];
-#ifdef __WORK_STEALING__
-		my_work = work_array[ray_index];
-		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-		get_pixel_tile_position(&pixel_x, &pixel_y,
+		throughput = kernel_split_state.throughput[ray_index];
+		state = &kernel_split_state.path_state[ray_index];
+
+		work_index = kernel_split_state.work_array[ray_index];
+		sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+		get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
 		                        &tile_x, &tile_y,
-		                        my_work,
-		                        sw, sh, sx, sy,
-		                        parallel_samples,
+		                        work_index,
 		                        ray_index);
-		my_sample_tile = 0;
-#else  /* __WORK_STEALING__ */
-		sample = work_array[ray_index];
-		/* Buffer's stride is "stride"; Find x and y using ray_index. */
-		int tile_index = ray_index / parallel_samples;
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif  /* __WORK_STEALING__ */
-		per_sample_output_buffers +=
-		    (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) *
-		    kernel_data.film.pass_stride;
+
+		buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
+
+#ifdef __SHADOW_TRICKS__
+		if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+			if (state->flag & PATH_RAY_CAMERA) {
+				state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+				state->catcher_object = sd->object;
+				if(!kernel_data.background.transparent) {
+					PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+					ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+					L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
+				}
+			}
+		}
+		else {
+			state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
+		}
+#endif  /* __SHADOW_TRICKS__ */
 
 		/* holdout */
 #ifdef __HOLDOUT__
-		if(((ccl_fetch(sd, flag) & SD_HOLDOUT) ||
-		    (ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK)) &&
+		if(((sd->flag & SD_HOLDOUT) ||
+		    (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
 		   (state->flag & PATH_RAY_CAMERA))
 		{
 			if(kernel_data.background.transparent) {
 				float3 holdout_weight;
-				if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) {
+				if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
 					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
 				}
 				else {
 					holdout_weight = shader_holdout_eval(kg, sd);
 				}
 				/* any throughput is ok, should all be identical here */
-				L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+				kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
 			}
-			if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) {
+			if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-				*enqueue_flag = 1;
+				enqueue_flag = 1;
 			}
 		}
 #endif  /* __HOLDOUT__ */
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		PathRadiance *L = &PathRadiance_coop[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 		/* Holdout mask objects do not write data passes. */
 		kernel_write_data_passes(kg,
-		                         per_sample_output_buffers,
+		                         buffer,
 		                         L,
 		                         sd,
 		                         sample,
@@ -183,12 +185,12 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 
 #ifdef __EMISSION__
 		/* emission */
-		if(ccl_fetch(sd, flag) & SD_EMISSION) {
+		if(sd->flag & SD_EMISSION) {
 			/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
 			float3 emission = indirect_primitive_emission(
 			        kg,
 			        sd,
-			        Intersection_coop[ray_index].t,
+			        kernel_split_state.isect[ray_index].t,
 			        state->flag,
 			        state->ray_pdf);
 			path_radiance_accum_emission(L, throughput, emission, state->bounce);
@@ -203,18 +205,18 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 
 		if(probability == 0.0f) {
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-			*enqueue_flag = 1;
+			enqueue_flag = 1;
 		}
 
 		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 			if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+				float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
 				if(terminate >= probability) {
 					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-					*enqueue_flag = 1;
+					enqueue_flag = 1;
 				}
 				else {
-					throughput_coop[ray_index] = throughput/probability;
+					kernel_split_state.throughput[ray_index] = throughput/probability;
 				}
 			}
 		}
@@ -224,37 +226,65 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion ||
-		   (ccl_fetch(sd, flag) & SD_AO))
+		   (sd->flag & SD_AO))
 		{
 			/* todo: solve correlation */
 			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+			path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 			float ao_factor = kernel_data.background.ao_factor;
 			float3 ao_N;
-			AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-			AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+			kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+			kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd);
 
 			float3 ao_D;
 			float ao_pdf;
 			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-			if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+			if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 				Ray _ray;
-				_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+				_ray.P = ray_offset(sd->P, sd->Ng);
 				_ray.D = ao_D;
 				_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-				_ray.time = ccl_fetch(sd, time);
+				_ray.time = sd->time;
 #endif
-				_ray.dP = ccl_fetch(sd, dP);
+				_ray.dP = sd->dP;
 				_ray.dD = differential3_zero();
-				AOLightRay_coop[ray_index] = _ray;
+				kernel_split_state.ao_light_ray[ray_index] = _ray;
 
 				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
-				*enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+				enqueue_flag_AO_SHADOW_RAY_CAST = 1;
 			}
 		}
 	}
 #endif  /* __AO__ */
+	kernel_split_state.rng[ray_index] = rng;
+
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        &locals->queue_atomics_bg,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag_AO_SHADOW_RAY_CAST,
+	                        kernel_split_params.queue_size,
+	                        &locals->queue_atomics_ao,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
new file mode 100644
index 00000000000..8192528622e
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_indirect_background(KernelGlobals *kg)
+{
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	int ray_index;
+
+	if(kernel_data.integrator.ao_bounces) {
+		ray_index = get_ray_index(kg, thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
+
+		if(ray_index != QUEUE_EMPTY_SLOT) {
+			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+				ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+				if(state->bounce > kernel_data.integrator.ao_bounces) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				}
+			}
+		}
+	}
+
+	ray_index = get_ray_index(kg, thread_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		/* eval background shader if nothing hit */
+		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+			*L_transparent = (*L_transparent) + average((*throughput));
+#ifdef __PASSES__
+			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+#ifdef __BACKGROUND__
+			/* sample background shader */
+			float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
+			path_radiance_accum_background(L, state, (*throughput), L_background);
+#endif
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+	}
+
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
new file mode 100644
index 00000000000..a56e85abeb9
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
+{
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index == 0) {
+		/* We will empty both queues in this kernel. */
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+
+	int ray_index;
+	get_ray_index(kg, thread_index,
+	              QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	              kernel_split_state.queue_data,
+	              kernel_split_params.queue_size,
+	              1);
+	ray_index = get_ray_index(kg, thread_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+#ifdef __SUBSURFACE__
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+		kernel_path_subsurface_accum_indirect(ss_indirect, L);
+
+		/* Trace indirect subsurface rays by restarting the loop. This uses less
+		 * stack memory than invoking kernel_path_indirect.
+		 */
+		if(ss_indirect->num_rays) {
+			kernel_path_subsurface_setup_indirect(kg,
+			                                      ss_indirect,
+			                                      state,
+			                                      ray,
+			                                      L,
+			                                      throughput);
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+		}
+		else {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+	}
+
+#endif  /* __SUBSURFACE__ */
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index 3bd0e361078..c669d79ddcd 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -14,50 +14,49 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_lamp_emission
- * This is the 3rd kernel in the ray-tracing logic. This is the second of the
- * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
- * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
- * and RAY_HIT_BACKGROUND.
+/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
  * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- * The input/output of the kernel is as follows,
- * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
- * Ray_coop -------------------------------------------|                           |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * PathState_coop -------------------------------------|                           |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * kg (globals) ---------------------------------------|                           |
- * Intersection_coop ----------------------------------|                           |
- * ray_state ------------------------------------------|                           |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                           |
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                           |
- * queuesize ------------------------------------------|                           |
- * use_queues_flag ------------------------------------|                           |
- * sw -------------------------------------------------|                           |
- * sh -------------------------------------------------|                           |
  */
-ccl_device void kernel_lamp_emission(
-        KernelGlobals *kg,
-        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
-        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
-        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
-        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
-        Intersection *Intersection_coop,       /* Required for lamp emission */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
-                                                * queues to fetch ray index
-                                                */
-        int ray_index)
+ccl_device void kernel_lamp_emission(KernelGlobals *kg)
 {
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
-	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+#ifndef __VOLUME__
+	/* We will empty this queue in this kernel. */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+#endif
+	/* Fetch use_queues_flag. */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(local_use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+#ifndef __VOLUME__
+		                          1
+#else
+		                          0
+#endif
+		                          );
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND))
 	{
-		PathRadiance *L = &PathRadiance_coop[ray_index];
-		ccl_global PathState *state = &PathState_coop[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-		float3 throughput = throughput_coop[ray_index];
-		Ray ray = Ray_coop[ray_index];
+		float3 throughput = kernel_split_state.throughput[ray_index];
+		Ray ray = kernel_split_state.ray[ray_index];
 
 #ifdef __LAMP_MIS__
 		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
@@ -65,7 +64,7 @@ ccl_device void kernel_lamp_emission(
 			Ray light_ray;
 
 			light_ray.P = ray.P - state->ray_t*ray.D;
-			state->ray_t += Intersection_coop[ray_index].t;
+			state->ray_t += kernel_split_state.isect[ray_index].t;
 			light_ray.D = ray.D;
 			light_ray.t = state->ray_t;
 			light_ray.time = ray.time;
@@ -74,10 +73,13 @@ ccl_device void kernel_lamp_emission(
 			/* intersect with lamp */
 			float3 emission;
 
-			if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) {
+			if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) {
 				path_radiance_accum_emission(L, throughput, emission, state->bounce);
 			}
 		}
 #endif  /* __LAMP_MIS__ */
 	}
 }
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 816f3a6fbff..1bebc16e25b 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -14,128 +14,163 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_setup_next_iteration kernel.
- * This is the tenth kernel in the ray tracing logic. This is the ninth
- * of the path iteration kernels. This kernel takes care of setting up
- * Ray for the next iteration of path-iteration and accumulating radiance
- * corresponding to AO and direct-lighting
+/* This kernel takes care of setting up ray for the next iteration of
+ * path-iteration and accumulating radiance corresponding to AO and
+ * direct-lighting
  *
- * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ * Ray state of rays that are terminated in this kernel are changed
+ * to RAY_UPDATE_BUFFER.
  *
- * The input and output are as follows,
+ * Note on queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                 |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * PathRadiance_coop ------------------------------------|                                 |--- throughput_coop
- * PathState_coop ---------------------------------------|                                 |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                 |--- PathState_coop
- * ray_state --------------------------------------------|                                 |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------|                                 |--- Ray_coop
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                 |--- use_queues_flag
- * Ray_coop ---------------------------------------------|                                 |
- * kg (globals) -----------------------------------------|                                 |
- * LightRay_dl_coop -------------------------------------|
- * ISLamp_coop ------------------------------------------|
- * BSDFEval_coop ----------------------------------------|
- * LightRay_ao_coop -------------------------------------|
- * AOBSDF_coop ------------------------------------------|
- * AOAlpha_coop -----------------------------------------|
- *
- * Note on queues,
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFF, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
  */
-ccl_device char kernel_next_iteration_setup(
-        KernelGlobals *kg,
-        ShaderData *sd,                       /* Required for setting up ray for next iteration */
-        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
-        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
-        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
-        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
-        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
-        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
-        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
-        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
-        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
-        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
-        ccl_global char *ray_state,           /* Denotes the state of each ray */
-        ccl_global char *use_queues_flag,     /* flag to decide if scene_intersect kernel should
-                                               * use queues to fetch ray index */
-        int ray_index)
+ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
+                                            ccl_local_param unsigned int *local_queue_atomics)
 {
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		/* If we are here, then it means that scene-intersect kernel
+		 * has already been executed at least once. From the next time,
+		 * scene-intersect kernel may operate on queues to fetch ray index.
+		 */
+		*kernel_split_params.use_queues_flag = 1;
+
+		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
+		 * previous kernel.
+		 */
+		kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
 	char enqueue_flag = 0;
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices,
+	 * expect all threads to execute barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
 
 	/* Load ShaderData structure. */
 	PathRadiance *L = NULL;
 	ccl_global PathState *state = NULL;
+	ccl_global char *ray_state = kernel_split_state.ray_state;
 
 	/* Path radiance update for AO/Direct_lighting's shadow blocked. */
 	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
 	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
 	{
-		state = &PathState_coop[ray_index];
-		L = &PathRadiance_coop[ray_index];
-		float3 _throughput = throughput_coop[ray_index];
+		state = &kernel_split_state.path_state[ray_index];
+		L = &kernel_split_state.path_radiance[ray_index];
+		float3 _throughput = kernel_split_state.throughput[ray_index];
 
 		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-			float3 shadow = LightRay_ao_coop[ray_index].P;
-			char update_path_radiance = LightRay_ao_coop[ray_index].t;
+			float3 shadow = kernel_split_state.ao_light_ray[ray_index].P;
+			// TODO(mai): investigate correctness here
+			char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t;
 			if(update_path_radiance) {
 				path_radiance_accum_ao(L,
 				                       _throughput,
-				                       AOAlpha_coop[ray_index],
-				                       AOBSDF_coop[ray_index],
+				                       kernel_split_state.ao_alpha[ray_index],
+				                       kernel_split_state.ao_bsdf[ray_index],
 				                       shadow,
 				                       state->bounce);
 			}
+			else {
+				path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]);
+			}
 			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
 		}
 
 		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
-			float3 shadow = LightRay_dl_coop[ray_index].P;
-			char update_path_radiance = LightRay_dl_coop[ray_index].t;
+			float3 shadow = kernel_split_state.light_ray[ray_index].P;
+			// TODO(mai): investigate correctness here
+			char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t;
+			BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
 			if(update_path_radiance) {
-				BsdfEval L_light = BSDFEval_coop[ray_index];
 				path_radiance_accum_light(L,
 				                          _throughput,
 				                          &L_light,
 				                          shadow,
 				                          1.0f,
 				                          state->bounce,
-				                          ISLamp_coop[ray_index]);
+				                          kernel_split_state.is_lamp[ray_index]);
+			}
+			else {
+				path_radiance_accum_total_light(L, _throughput, &L_light);
 			}
 			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
 		}
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		ccl_global float3 *throughput = &throughput_coop[ray_index];
-		ccl_global Ray *ray = &Ray_coop[ray_index];
-		ccl_global RNG *rng = &rng_coop[ray_index];
-		state = &PathState_coop[ray_index];
-		L = &PathRadiance_coop[ray_index];
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		RNG rng = kernel_split_state.rng[ray_index];
+		state = &kernel_split_state.path_state[ray_index];
+		L = &kernel_split_state.path_radiance[ray_index];
 
 		/* Compute direct lighting and next bounce. */
-		if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
+		if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) {
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
 			enqueue_flag = 1;
 		}
+		kernel_split_state.rng[ray_index] = rng;
 	}
 
-	return enqueue_flag;
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
new file mode 100644
index 00000000000..a7ecde7c80d
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* First kernel in ray-tracing logic: initializes the structures needed by the
+ * path-iteration kernels. Rays that get no work (per the original note: rays
+ * outside the tile-boundary) are marked RAY_INACTIVE; rays whose ray.t is 0
+ * write a zero float4 to the output buffer and are marked RAY_TO_REGENERATE.
+ */
+ccl_device void kernel_path_init(KernelGlobals *kg) {
+	int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
+
+	/* This is the first assignment to ray_state,
+	 * so we don't use the ASSIGN_RAY_STATE macro.
+	 */
+	kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
+
+	unsigned int my_sample;
+	unsigned int pixel_x;
+	unsigned int pixel_y;
+	unsigned int tile_x;
+	unsigned int tile_y;
+
+	unsigned int work_index = 0;
+	/* Get work. */
+	if(!get_next_work(kg, &work_index, ray_index)) {
+		/* No more work, mark ray as inactive. */
+		kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
+
+		return;
+	}
+
+	/* Get the sample associated with the work. */
+	my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+
+	/* Get pixel and tile position associated with the work. */
+	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
+	                             &tile_x, &tile_y,
+	                             work_index,
+	                             ray_index);
+	kernel_split_state.work_array[ray_index] = work_index;
+
+	ccl_global uint *rng_state = kernel_split_params.rng_state;
+	rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
+
+	ccl_global float *buffer = kernel_split_params.buffer;
+	buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
+
+	RNG rng = kernel_split_state.rng[ray_index];  /* Working copy; stored back on the last line. */
+
+	/* Initialize random numbers and ray. */
+	kernel_path_trace_setup(kg,
+	                        rng_state,
+	                        my_sample,
+	                        pixel_x, pixel_y,
+	                        &rng,
+	                        &kernel_split_state.ray[ray_index]);
+
+	if(kernel_split_state.ray[ray_index].t != 0.0f) {
+		/* Initialize throughput, L_transparent, Ray, PathState;
+		 * These rays proceed with path-iteration.
+		 */
+		kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+		kernel_split_state.L_transparent[ray_index] = 0.0f;
+		path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
+		path_state_init(kg,
+		                &kernel_split_state.sd_DL_shadow[ray_index],
+		                &kernel_split_state.path_state[ray_index],
+		                &rng,
+		                my_sample,
+		                &kernel_split_state.ray[ray_index]);
+#ifdef __SUBSURFACE__
+		kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+		debug_data_init(&kernel_split_state.debug_data[ray_index]);
+#endif
+	}
+	else {
+		/* These rays do not participate in path-iteration. */
+		float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		/* Write the empty result, then end the RNG set up above (the split-state copy is stale here). */
+		kernel_write_pass_float4(buffer, my_sample, L_rad);
+		path_rng_end(kg, rng_state, rng);
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
+	}
+	kernel_split_state.rng[ray_index] = rng;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
new file mode 100644
index 00000000000..e2e841f36d3
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel enqueues rays of different ray state into their
+ * appropriate queues:
+ *
+ * 1. Rays that have been determined to hit the background from the
+ *    "kernel_scene_intersect" kernel are enqueued in
+ *    QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in
+ *    path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * State of queue during other times this kernel is called:
+ * At entry,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
+ *     and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
+                                     ccl_local_param QueueEnqueueLocals *locals)
+{
+	/* Sorts each ray into one of two queues by its ray state; we have only 2 cases (Hit/Not-Hit). */
+	int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);  /* Linearized 2D local id. */
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);  /* Linearized 2D global id. */
+
+	if(lidx == 0) {  /* The thread with local index 0 clears both counters. */
+		locals->queue_atomics[0] = 0;  /* NOTE(review): assumes the two queue enum values used below are 0 and 1 - confirm. */
+		locals->queue_atomics[1] = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);  /* Counters are cleared before any thread uses them below. */
+
+	int queue_number = -1;  /* -1 means this ray is not enqueued by this kernel. */
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	}
+	else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx;  /* Only assigned, and only read later, when queue_number != -1. */
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);  /* NOTE(review): presumably reserves a slot in the local counter - confirm in the queue helpers. */
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);  /* All get_local_queue_index() calls finish before the counters are overwritten. */
+
+	if(lidx == 0) {  /* The thread with local index 0 replaces each counter with the helper's return value. */
+		locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                                    locals->queue_atomics,
+		                                    kernel_split_params.queue_index);  /* NOTE(review): presumably this group's base offset in the global queue - confirm. */
+		locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+		                                    locals->queue_atomics,
+		                                    kernel_split_params.queue_index);
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);  /* The values written just above are read by every enqueuing thread below. */
+
+	unsigned int my_gqidx;  /* Index into queue_data where this ray's index is stored. */
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number,
+		                                  kernel_split_params.queue_size,
+		                                  my_lqidx,
+		                                  locals->queue_atomics);
+		kernel_split_state.queue_data[my_gqidx] = ray_index;  /* Record this ray in its queue. */
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 2388580051f..684760eedee 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -14,81 +14,47 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_scene_intersect kernel.
- * This is the second kernel in the ray tracing logic. This is the first
- * of the path iteration kernels. This kernel takes care of scene_intersect function.
+/* This kernel takes care of scene_intersect function.
  *
  * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
  * This kernel processes rays of ray state RAY_ACTIVE
- * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
- *
- * The input and output are as follows,
- *
- * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
- * PathState_coop ---------------------------------|                                          |--- Intersection
- * ray_state --------------------------------------|                                          |--- ray_state
- * use_queues_flag --------------------------------|                                          |
- * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                          |
- * kg (globals) -----------------------------------|                                          |
- * rng_coop ---------------------------------------|                                          |
- * sw ---------------------------------------------|                                          |
- * sh ---------------------------------------------|                                          |
- * queuesize --------------------------------------|                                          |
- *
- * Note on Queues :
- * Ideally we would want kernel_scene_intersect to work on queues.
- * But during the very first time, the queues will be empty and hence we perform a direct mapping
- * between ray-index and thread-index; From the next time onward, the queue will be filled and
- * we may start operating on queues.
- *
- * State of queue during the first time this kernel is called :
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel
- *
- * State of queues during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
- * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
- * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
- * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
- * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
- * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change
+ * This kernel determines the rays that have hit the background and changes
+ * their ray state to RAY_HIT_BACKGROUND.
  */
-
-ccl_device void kernel_scene_intersect(
-        KernelGlobals *kg,
-        ccl_global uint *rng_coop,
-        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
-        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
-        Intersection *Intersection_coop,       /* Required for scene_intersect */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
-                                                * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int ray_index)
+ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 {
+	/* Fetch use_queues_flag */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(local_use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+
 	/* All regenerated rays become active here */
-	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED))
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
 
-	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
+	if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE))
 		return;
 
 #ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &debugdata_coop[ray_index];
+	DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
 #endif
-	Intersection *isect = &Intersection_coop[ray_index];
-	PathState state = PathState_coop[ray_index];
-	Ray ray = Ray_coop[ray_index];
+	Intersection isect;
+	PathState state = kernel_split_state.path_state[ray_index];
+	Ray ray = kernel_split_state.ray[ray_index];
 
 	/* intersect scene */
 	uint visibility = path_state_ray_visibility(kg, &state);
@@ -96,7 +62,7 @@ ccl_device void kernel_scene_intersect(
 #ifdef __HAIR__
 	float difl = 0.0f, extmax = 0.0f;
 	uint lcg_state = 0;
-	RNG rng = rng_coop[ray_index];
+	RNG rng = kernel_split_state.rng[ray_index];
 
 	if(kernel_data.bvh.have_curves) {
 		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
@@ -106,19 +72,25 @@ ccl_device void kernel_scene_intersect(
 		}
 
 		extmax = kernel_data.curve.maximum_width;
-		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+		lcg_state = lcg_state_init(&rng, state.rng_offset, state.sample, 0x51633e2d);
+	}
+
+	if(state.bounce > kernel_data.integrator.ao_bounces) {
+		visibility = PATH_RAY_SHADOW;
+		ray.t = kernel_data.background.ao_distance;
 	}
 
-	bool hit = scene_intersect(kg, ray, visibility, isect, &lcg_state, difl, extmax);
+	bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-	bool hit = scene_intersect(kg, ray, visibility, isect, NULL, 0.0f, 0.0f);
+	bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif
+	kernel_split_state.isect[ray_index] = isect;
 
 #ifdef __KERNEL_DEBUG__
 	if(state.flag & PATH_RAY_CAMERA) {
-		debug_data->num_bvh_traversed_nodes += isect->num_traversed_nodes;
-		debug_data->num_bvh_traversed_instances += isect->num_traversed_instances;
-		debug_data->num_bvh_intersections += isect->num_intersections;
+		debug_data->num_bvh_traversed_nodes += isect.num_traversed_nodes;
+		debug_data->num_bvh_traversed_instances += isect.num_traversed_instances;
+		debug_data->num_bvh_intersections += isect.num_intersections;
 	}
 	debug_data->num_ray_bounces++;
 #endif
@@ -128,6 +100,8 @@ ccl_device void kernel_scene_intersect(
 		 * These rays undergo special processing in the
 		 * background_bufferUpdate kernel.
 		 */
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index cef64bf5f36..0f1696e34a0 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -14,57 +14,58 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_shader_eval kernel
- * This kernel is the 5th kernel in the ray tracing logic. This is
- * the 4rd kernel in path iteration. This kernel sets up the ShaderData
- * structure from the values computed by the previous kernels. It also identifies
- * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+/* This kernel sets up the ShaderData structure from the values computed
+ * by the previous kernels.
  *
- * The input and output of the kernel is as follows,
- * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- sd
- * Ray_coop -------------------------------------------|                         |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * PathState_coop -------------------------------------|                         |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Intersection_coop ----------------------------------|                         |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------|                         |
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                         |
- * ray_state ------------------------------------------|                         |
- * kg (globals) ---------------------------------------|                         |
- * queuesize ------------------------------------------|                         |
- *
- * Note on Queues :
- * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE;
- * State of queues when this kernel is called,
- * at entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * at exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+ * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
+ * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
  */
-ccl_device void kernel_shader_eval(
-        KernelGlobals *kg,
-        ShaderData *sd,                        /* Output ShaderData structure to be filled */
-        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
-        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
-        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
-        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int ray_index)
+ccl_device void kernel_shader_eval(KernelGlobals *kg,
+                                   ccl_local_param unsigned int *local_queue_atomics)
 {
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		Intersection *isect = &Intersection_coop[ray_index];
-		ccl_global uint *rng = &rng_coop[ray_index];
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		Ray ray = Ray_coop[ray_index];
+	/* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	char enqueue_flag = 0;
+	if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
+		enqueue_flag = 1;
+	}
+
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+	/* Continue on with shader evaluation. */
+	if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		Intersection isect = kernel_split_state.isect[ray_index];
+		RNG rng = kernel_split_state.rng[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		Ray ray = kernel_split_state.ray[ray_index];
 
 		shader_setup_from_ray(kg,
-		                      sd,
-		                      isect,
+		                      &kernel_split_state.sd[ray_index],
+		                      &isect,
 		                      &ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+		float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
+		shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+		kernel_split_state.rng[ray_index] = rng;
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
deleted file mode 100644
index 6153af47f96..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_shadow_blocked kernel.
- * This is the ninth kernel in the ray tracing logic. This is the eighth
- * of the path iteration kernels. This kernel takes care of "shadow ray cast"
- * logic of the direct lighting and AO  part of ray tracing.
- *
- * The input and output are as follows,
- *
- * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
- * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
- * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
- * ray_state ---------------------------------------|                            |--- ray_state
- * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS)
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * kg (globals) ------------------------------------|                            |
- * queuesize ---------------------------------------|                            |
- *
- * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself.
- * Note on queues :
- * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
- * these queues this kernel.
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
- * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
- */
-ccl_device void kernel_shadow_blocked(
-        KernelGlobals *kg,
-        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
-        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
-        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
-        ccl_global char *ray_state,
-        char shadow_blocked_type,
-        int ray_index)
-{
-	/* Flag determining if we need to update L. */
-	char update_path_radiance = 0;
-
-	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
-	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
-	{
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
-		ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
-
-		ccl_global Ray *light_ray_global =
-		        shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO
-		                ? light_ray_ao_global
-		                : light_ray_dl_global;
-
-		float3 shadow;
-		update_path_radiance = !(shadow_blocked(kg,
-		                                        kg->sd_input,
-		                                        state,
-		                                        light_ray_global,
-		                                        &shadow));
-
-		/* We use light_ray_global's P and t to store shadow and
-		 * update_path_radiance.
-		 */
-		light_ray_global->P = shadow;
-		light_ray_global->t = update_path_radiance;
-	}
-}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
new file mode 100644
index 00000000000..4243e18de72
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for AO. */
+ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
+{
+	unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index < ao_queue_length) {
+		ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+		                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	/* Flag determining if we need to update L. */
+	char update_path_radiance = 0;
+
+	if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index];
+
+		float3 shadow;
+		Ray ray = *light_ray_global;
+		update_path_radiance = !(shadow_blocked(kg,
+		                                        &kernel_split_state.sd_DL_shadow[ray_index],
+		                                        state,
+		                                        &ray,
+		                                        &shadow));
+
+		*light_ray_global = ray;
+		/* We use light_ray_global's P and t to store shadow and
+		 * update_path_radiance.
+		 */
+		light_ray_global->P = shadow;
+		light_ray_global->t = update_path_radiance;
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
new file mode 100644
index 00000000000..bb8f0157965
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for direct visible light. */
+ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
+{
+	unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index < dl_queue_length) {
+		ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+		                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	/* Flag determining if we need to update L. */
+	char update_path_radiance = 0;
+
+	if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index];
+
+		float3 shadow;
+		Ray ray = *light_ray_global;
+		update_path_radiance = !(shadow_blocked(kg,
+		                                        &kernel_split_state.sd_DL_shadow[ray_index],
+		                                        state,
+		                                        &ray,
+		                                        &shadow));
+
+		*light_ray_global = ray;
+		/* We use light_ray_global's P and t to store shadow and
+		 * update_path_radiance.
+		 */
+		light_ray_global->P = shadow;
+		light_ray_global->t = update_path_radiance;
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 2135ee22b2e..4303ba0a905 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -17,48 +17,61 @@
 #ifndef  __KERNEL_SPLIT_H__
 #define  __KERNEL_SPLIT_H__
 
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_image_opencl.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
 
-#include "util_atomic.h"
+#include "kernel/split/kernel_split_data.h"
 
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
+#include "kernel/kernel_globals.h"
 
-#include "geom/geom.h"
-#include "bvh/bvh.h"
+#ifdef __OSL__
+#  include "kernel/osl/osl_shader.h"
+#endif
+
+#ifdef __KERNEL_OPENCL__
+#  include "kernel/kernel_image_opencl.h"
+#endif
+#ifdef __KERNEL_CPU__
+#  include "kernel/kernels/cpu/kernel_cpu_image.h"
+#endif
+
+#include "util/util_atomic.h"
+
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_camera.h"
+
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
 
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
 
 #ifdef __SUBSURFACE__
-#include "kernel_subsurface.h"
+#  include "kernel/kernel_subsurface.h"
 #endif
 
 #ifdef __VOLUME__
-#include "kernel_volume.h"
+#  include "kernel/kernel_volume.h"
 #endif
 
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shadow.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_path_common.h"
+#include "kernel/kernel_path_surface.h"
+#include "kernel/kernel_path_volume.h"
+#include "kernel/kernel_path_subsurface.h"
 
 #ifdef __KERNEL_DEBUG__
-#include "kernel_debug.h"
+#  include "kernel/kernel_debug.h"
 #endif
 
-#include "kernel_queues.h"
-#include "kernel_work_stealing.h"
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
 
 #endif  /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
new file mode 100644
index 00000000000..17e6587883a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_H__
+#define __KERNEL_SPLIT_DATA_H__
+
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
+{
+	(void)kg;  /* Unused on CPU. */
+
+	uint64_t size = 0;
+#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
+	size = size SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+
+#ifdef __SUBSURFACE__
+	size += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); /* ss_rays */
+#endif
+
+#ifdef __VOLUME__
+	size += align_up(2 * num_elements * sizeof(PathState), 16); /* state_shadow */
+#endif
+
+	return size;
+}
+
+ccl_device_inline void split_data_init(KernelGlobals *kg,
+                                       ccl_global SplitData *split_data,
+                                       size_t num_elements,
+                                       ccl_global void *data,
+                                       ccl_global char *ray_state)
+{
+	(void)kg;  /* Unused on CPU. */
+
+	ccl_global char *p = (ccl_global char*)data;
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+	split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16);
+	SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+
+#ifdef __SUBSURFACE__
+	split_data->ss_rays = (ccl_global SubsurfaceIndirectRays*)p;
+	p += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16);
+#endif
+
+#ifdef __VOLUME__
+	split_data->state_shadow = (ccl_global PathState*)p;
+	p += align_up(2 * num_elements * sizeof(PathState), 16);
+#endif
+
+	split_data->ray_state = ray_state;
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* __KERNEL_SPLIT_DATA_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
new file mode 100644
index 00000000000..748197b7183
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
+#define __KERNEL_SPLIT_DATA_TYPES_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Parameters used by the split kernels; a single struct is used to avoid passing these to each kernel. */
+
+typedef struct SplitParams {
+	int x;
+	int y;
+	int w;
+	int h;
+
+	int offset;
+	int stride;
+
+	ccl_global uint *rng_state;
+
+	int start_sample;
+	int end_sample;
+
+	ccl_global unsigned int *work_pools;
+	unsigned int num_samples;
+
+	ccl_global int *queue_index;
+	int queue_size;
+	ccl_global char *use_queues_flag;
+
+	ccl_global float *buffer;
+} SplitParams;
+
+/* Global memory variables [porting]: this memory is used for
+ * co-operation between different kernels. Data written by one
+ * kernel will be available to another kernel via this global
+ * memory.
+ */
+
+/* SPLIT_DATA_ENTRY(type, name, num) */
+
+#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__)
+/* DebugData memory */
+#  define SPLIT_DATA_DEBUG_ENTRIES \
+	SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
+#else
+#  define SPLIT_DATA_DEBUG_ENTRIES
+#endif
+
+#define SPLIT_DATA_ENTRIES \
+	SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
+	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
+	SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+	SPLIT_DATA_DEBUG_ENTRIES \
+
+/* struct that holds pointers to data in the shared state buffer */
+typedef struct SplitData {
+#define SPLIT_DATA_ENTRY(type, name, num) type *name;
+	SPLIT_DATA_ENTRIES
+#undef SPLIT_DATA_ENTRY
+
+#ifdef __SUBSURFACE__
+	ccl_global SubsurfaceIndirectRays *ss_rays;
+#endif
+
+#ifdef __VOLUME__
+	ccl_global PathState *state_shadow;
+#endif
+
+	/* This is actually in a separate buffer from the rest of the split state data (so it can be read back from
+	 * the host easily), but it is used the same way as the other data, so it is kept in this struct as well.
+	 */
+	ccl_global char *ray_state;
+} SplitData;
+
+#ifndef __KERNEL_CUDA__
+#  define kernel_split_state (kg->split_data)
+#  define kernel_split_params (kg->split_param_data)
+#else
+__device__ SplitData __split_data;
+#  define kernel_split_state (__split_data)
+__device__ SplitParams __split_param_data;
+#  define kernel_split_params (__split_param_data)
+#endif  /* __KERNEL_CUDA__ */
+
+/* Local storage for queue_enqueue kernel. */
+typedef struct QueueEnqueueLocals {
+	uint queue_atomics[2];
+} QueueEnqueueLocals;
+
+/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
+typedef struct BackgroundAOLocals {
+	uint queue_atomics_bg;
+	uint queue_atomics_ao;
+} BackgroundAOLocals;
+
+CCL_NAMESPACE_END
+
+#endif  /* __KERNEL_SPLIT_DATA_TYPES_H__ */
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
new file mode 100644
index 00000000000..0b4d50c70ee
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
+                                          ccl_local_param unsigned int* local_queue_atomics)
+{
+#ifdef __SUBSURFACE__
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+	char enqueue_flag = 0;
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	RNG rng = kernel_split_state.rng[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+	ShaderData *sd = &kernel_split_state.sd[ray_index];
+	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		if(sd->flag & SD_BSSRDF) {
+			if(kernel_path_subsurface_scatter(kg,
+			                                  sd,
+			                                  emission_sd,
+			                                  L,
+			                                  state,
+			                                  &rng,
+			                                  ray,
+			                                  throughput,
+			                                  ss_indirect)) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				enqueue_flag = 1;
+			}
+		}
+		kernel_split_state.rng[ray_index] = rng;
+	}
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#endif  /* __SUBSURFACE__ */
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
deleted file mode 100644
index a21e9b6a0b1..00000000000
--- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../kernel_compat_opencl.h"
-#include "../kernel_math.h"
-#include "../kernel_types.h"
-#include "../kernel_globals.h"
-
-/* Since we process various samples in parallel; The output radiance of different samples
- * are stored in different locations; This kernel combines the output radiance contributed
- * by all different samples and stores them in the RenderTile's output buffer.
- */
-ccl_device void kernel_sum_all_radiance(
-        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
-        ccl_global float *buffer,                    /* Output buffer of RenderTile */
-        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
-        int parallel_samples, int sw, int sh, int stride,
-        int buffer_offset_x,
-        int buffer_offset_y,
-        int buffer_stride,
-        int start_sample)
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	if(x < sw && y < sh) {
-		buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride);
-		per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride);
-
-		int sample_stride = (data->film.pass_stride);
-
-		int sample_iterator = 0;
-		int pass_stride_iterator = 0;
-		int num_floats = data->film.pass_stride;
-
-		for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) {
-			for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
-				*(buffer + pass_stride_iterator) =
-				        (start_sample == 0 && sample_iterator == 0)
-				                ? *(per_sample_output_buffer + pass_stride_iterator)
-				                : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
-			}
-			per_sample_output_buffer += sample_stride;
-		}
-	}
-}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 88ec7fe6fcc..d748e76fa80 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -39,7 +39,7 @@
  * mostly taken care of in the SVM compiler.
  */
 
-#include "svm_types.h"
+#include "kernel/svm/svm_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -139,49 +139,49 @@ CCL_NAMESPACE_END
 
 /* Nodes */
 
-#include "svm_noise.h"
+#include "kernel/svm/svm_noise.h"
 #include "svm_texture.h"
 
-#include "svm_color_util.h"
-#include "svm_math_util.h"
-
-#include "svm_attribute.h"
-#include "svm_gradient.h"
-#include "svm_blackbody.h"
-#include "svm_closure.h"
-#include "svm_noisetex.h"
-#include "svm_convert.h"
-#include "svm_displace.h"
-#include "svm_fresnel.h"
-#include "svm_wireframe.h"
-#include "svm_wavelength.h"
-#include "svm_camera.h"
-#include "svm_geometry.h"
-#include "svm_hsv.h"
-#include "svm_image.h"
-#include "svm_gamma.h"
-#include "svm_brightness.h"
-#include "svm_invert.h"
-#include "svm_light_path.h"
-#include "svm_magic.h"
-#include "svm_mapping.h"
-#include "svm_normal.h"
-#include "svm_wave.h"
-#include "svm_math.h"
-#include "svm_mix.h"
-#include "svm_ramp.h"
-#include "svm_sepcomb_hsv.h"
-#include "svm_sepcomb_vector.h"
-#include "svm_musgrave.h"
-#include "svm_sky.h"
-#include "svm_tex_coord.h"
-#include "svm_value.h"
-#include "svm_voronoi.h"
-#include "svm_checker.h"
-#include "svm_brick.h"
-#include "svm_vector_transform.h"
-#include "svm_voxel.h"
-#include "svm_bump.h"
+#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_math_util.h"
+
+#include "kernel/svm/svm_attribute.h"
+#include "kernel/svm/svm_gradient.h"
+#include "kernel/svm/svm_blackbody.h"
+#include "kernel/svm/svm_closure.h"
+#include "kernel/svm/svm_noisetex.h"
+#include "kernel/svm/svm_convert.h"
+#include "kernel/svm/svm_displace.h"
+#include "kernel/svm/svm_fresnel.h"
+#include "kernel/svm/svm_wireframe.h"
+#include "kernel/svm/svm_wavelength.h"
+#include "kernel/svm/svm_camera.h"
+#include "kernel/svm/svm_geometry.h"
+#include "kernel/svm/svm_hsv.h"
+#include "kernel/svm/svm_image.h"
+#include "kernel/svm/svm_gamma.h"
+#include "kernel/svm/svm_brightness.h"
+#include "kernel/svm/svm_invert.h"
+#include "kernel/svm/svm_light_path.h"
+#include "kernel/svm/svm_magic.h"
+#include "kernel/svm/svm_mapping.h"
+#include "kernel/svm/svm_normal.h"
+#include "kernel/svm/svm_wave.h"
+#include "kernel/svm/svm_math.h"
+#include "kernel/svm/svm_mix.h"
+#include "kernel/svm/svm_ramp.h"
+#include "kernel/svm/svm_sepcomb_hsv.h"
+#include "kernel/svm/svm_sepcomb_vector.h"
+#include "kernel/svm/svm_musgrave.h"
+#include "kernel/svm/svm_sky.h"
+#include "kernel/svm/svm_tex_coord.h"
+#include "kernel/svm/svm_value.h"
+#include "kernel/svm/svm_voronoi.h"
+#include "kernel/svm/svm_checker.h"
+#include "kernel/svm/svm_brick.h"
+#include "kernel/svm/svm_vector_transform.h"
+#include "kernel/svm/svm_voxel.h"
+#include "kernel/svm/svm_bump.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -192,7 +192,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
-	int offset = ccl_fetch(sd, shader) & SHADER_MASK;
+	int offset = sd->shader & SHADER_MASK;
 
 	while(1) {
 		uint4 node = read_node(kg, &offset);
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 0e55c99ae97..229a3f20421 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData
 
 	AttributeDescriptor desc;
 
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
+	if(sd->object != OBJECT_NONE) {
 		desc = find_attribute(kg, sd, node.y);
 		if(desc.offset == ATTR_STD_NOT_FOUND) {
 			desc = attribute_not_found();
diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h
index 04a8c7b64e5..610d9af9e1f 100644
--- a/intern/cycles/kernel/svm/svm_bump.h
+++ b/intern/cycles/kernel/svm/svm_bump.h
@@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN
 ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
 {
 	/* save state */
-	stack_store_float3(stack, offset+0, ccl_fetch(sd, P));
-	stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx);
-	stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy);
+	stack_store_float3(stack, offset+0, sd->P);
+	stack_store_float3(stack, offset+3, sd->dP.dx);
+	stack_store_float3(stack, offset+6, sd->dP.dy);
 
 	/* set state as if undisplaced */
 	const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED);
@@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa
 		object_dir_transform(kg, sd, &dPdx);
 		object_dir_transform(kg, sd, &dPdy);
 
-		ccl_fetch(sd, P) = P;
-		ccl_fetch(sd, dP).dx = dPdx;
-		ccl_fetch(sd, dP).dy = dPdy;
+		sd->P = P;
+		sd->dP.dx = dPdx;
+		sd->dP.dy = dPdy;
 	}
 }
 
 ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
 {
 	/* restore state */
-	ccl_fetch(sd, P) = stack_load_float3(stack, offset+0);
-	ccl_fetch(sd, dP).dx = stack_load_float3(stack, offset+3);
-	ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6);
+	sd->P = stack_load_float3(stack, offset+0);
+	sd->dP.dx = stack_load_float3(stack, offset+3);
+	sd->dP.dy = stack_load_float3(stack, offset+6);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 00678a49d70..90249dfd978 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
 	float3 vector;
 
 	Transform tfm = kernel_data.cam.worldtocamera;
-	vector = transform_point(&tfm, ccl_fetch(sd, P));
+	vector = transform_point(&tfm, sd->P);
 	zdepth = vector.z;
 	distance = len(vector);
 
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 024d7d6447a..9a3689a94f4 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 			bsdf->alpha_y = 0.0f;
 			bsdf->alpha_x = 0.0f;
 			bsdf->ior = eta;
-			ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+			sd->flag |= bsdf_refraction_setup(bsdf);
 		}
 		else {
 			bsdf->alpha_y = 0.0f;
 			bsdf->alpha_x = 0.0f;
 			bsdf->ior = 0.0f;
-			ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
+			sd->flag |= bsdf_reflection_setup(bsdf);
 		}
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
@@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+			sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
+			sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
 	}
 	else {
 		bsdf->alpha_x = roughness;
@@ -50,9 +50,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+			sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
+			sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
 	}
 }
 
@@ -70,7 +70,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 	if(mix_weight == 0.0f)
 		return;
 
-	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);
+	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N;
 
 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
@@ -110,10 +110,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F);
 
 			/* calculate ior */
-			float ior = (ccl_fetch(sd, flag) & SD_BACKFACING) ? 1.0f / eta : eta;
+			float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta;
 
 			// calculate fresnel for refraction
-			float cosNO = dot(N, ccl_fetch(sd, I));
+			float cosNO = dot(N, sd->I);
 			float fresnel = fresnel_dielectric_cos(cosNO, ior);
 
 			// calculate weights of the diffuse and specular part
@@ -129,7 +129,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 			// get the additional clearcoat normal and subsurface scattering radius
 			uint4 data_cn_ssr = read_node(kg, offset);
-			float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : ccl_fetch(sd, N);
+			float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N;
 			float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f);
 
 			// get the subsurface color
@@ -137,7 +137,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float3 subsurface_color = stack_valid(data_subsurface_color.x) ? stack_load_float3(stack, data_subsurface_color.x) :
 				make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w));
 
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 
 #ifdef __SUBSURFACE__
 			float3 albedo = subsurface_color * subsurface + base_color * (1.0f - subsurface);
@@ -163,7 +163,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bsdf->roughness = roughness;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bsdf_principled_diffuse_setup(bsdf);
+						sd->flag |= bsdf_principled_diffuse_setup(bsdf);
 					}
 				}
 				else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) {
@@ -186,7 +186,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bssrdf->roughness = roughness;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+						sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
 					}
 
 					bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f));
@@ -200,7 +200,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bssrdf->roughness = roughness;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+						sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
 					}
 
 					bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z));
@@ -214,7 +214,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bssrdf->roughness = roughness;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+						sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
 					}
 				}
 			}
@@ -230,7 +230,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->roughness = roughness;
 
 					/* setup bsdf */
-					ccl_fetch(sd, flag) |= bsdf_principled_diffuse_setup(bsdf);
+					sd->flag |= bsdf_principled_diffuse_setup(bsdf);
 				}
 			}
 #endif
@@ -251,7 +251,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->N = N;
 
 					/* setup bsdf */
-					ccl_fetch(sd, flag) |= bsdf_principled_sheen_setup(bsdf);
+					sd->flag |= bsdf_principled_sheen_setup(bsdf);
 				}
 			}
 
@@ -286,9 +286,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 						/* setup bsdf */
 						if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */
-							ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf);
+							sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf);
 						else /* use multi-scatter GGX */
-							ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf);
+							sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf);
 					}
 				}
 #ifdef __CAUSTICS_TRICKS__
@@ -326,7 +326,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 								bsdf->extra->cspec0 = cspec0;
 
 								/* setup bsdf */
-								ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_fresnel_setup(bsdf);
+								sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf);
 							}
 						}
 
@@ -350,7 +350,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 								bsdf->ior = ior;
 
 								/* setup bsdf */
-								ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+								sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 							}
 						}
 					}
@@ -371,7 +371,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 							bsdf->extra->cspec0 = cspec0;
 
 							/* setup bsdf */
-							ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf);
+							sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf);
 						}
 					}
 				}
@@ -399,7 +399,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bsdf->extra->clearcoat = clearcoat;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_clearcoat_setup(bsdf);
+						sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf);
 					}
 				}
 #ifdef __CAUSTICS_TRICKS__
@@ -409,7 +409,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			break;
 		}
 		case CLOSURE_BSDF_DIFFUSE_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight);
 
 			if(bsdf) {
@@ -418,31 +418,31 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				float roughness = param1;
 
 				if(roughness == 0.0f) {
-					ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
+					sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
 				}
 				else {
 					bsdf->roughness = roughness;
-					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf);
+					sd->flag |= bsdf_oren_nayar_setup(bsdf);
 				}
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSLUCENT_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
 
 			if(bsdf) {
 				bsdf->N = N;
-				ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf);
+				sd->flag |= bsdf_translucent_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSPARENT_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
 			if(bsdf) {
-				ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
+				sd->flag |= bsdf_transparent_setup(bsdf);
 			}
 			break;
 		}
@@ -455,7 +455,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -467,21 +467,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFLECTION_ID)
-					ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
+					sd->flag |= bsdf_reflection_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
+					sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
+					sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) {
 					kernel_assert(stack_valid(data_node.z));
 					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 					if(bsdf->extra) {
 						bsdf->extra->color = stack_load_float3(stack, data_node.z);
-						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf);
+						sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
 					}
 				}
 				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf);
+					sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf);
 			}
 
 			break;
@@ -493,7 +493,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -501,7 +501,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->extra = NULL;
 
 				float eta = fmaxf(param2, 1e-5f);
-				eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+				eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFRACTION_ID) {
@@ -509,7 +509,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->alpha_y = 0.0f;
 					bsdf->ior = eta;
 
-					ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+					sd->flag |= bsdf_refraction_setup(bsdf);
 				}
 				else {
 					bsdf->alpha_x = param1;
@@ -517,9 +517,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->ior = eta;
 
 					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+						sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 					else
-						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+						sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 				}
 			}
 
@@ -535,14 +535,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				break;
 			}
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
-			eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+			eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 			/* fresnel */
-			float cosNO = dot(N, ccl_fetch(sd, I));
+			float cosNO = dot(N, sd->I);
 			float fresnel = fresnel_dielectric_cos(cosNO, eta);
 			float roughness = param1;
 
@@ -581,7 +581,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 			MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 
@@ -593,13 +593,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->alpha_x = param1;
 				bsdf->alpha_y = param1;
 				float eta = fmaxf(param2, 1e-5f);
-				bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+				bsdf->ior = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 				kernel_assert(stack_valid(data_node.z));
 				bsdf->extra->color = stack_load_float3(stack, data_node.z);
 
 				/* setup bsdf */
-				ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
+				sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
 			}
 
 			break;
@@ -612,7 +612,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -642,33 +642,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->ior = 0.0f;
 
 				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) {
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
+					sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
 				}
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) {
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf);
+					sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf);
 				}
 				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) {
 					kernel_assert(stack_valid(data_node.w));
 					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 					if(bsdf->extra) {
 						bsdf->extra->color = stack_load_float3(stack, data_node.w);
-						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
+						sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
 					}
 				}
 				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
+					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight);
 
 			if(bsdf) {
 				bsdf->N = N;
 
 				bsdf->sigma = saturate(param1);
-				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf);
+				sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf);
 			}
 			break;
 		}
@@ -678,7 +678,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				break;
 #endif
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight);
 
 			if(bsdf) {
@@ -687,18 +687,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->smooth = param2;
 				
 				if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
-					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf);
+					sd->flag |= bsdf_diffuse_toon_setup(bsdf);
 				else
-					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf);
+					sd->flag |= bsdf_glossy_toon_setup(bsdf);
 			}
 			break;
 		}
 #ifdef __HAIR__
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			
-			if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+			if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
 				ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
 				if(bsdf) {
@@ -708,7 +708,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					 * better figure out a way to skip backfaces from rays
 					 * spawned by transmission from the front */
 					bsdf->weight = make_float3(1.0f, 1.0f, 1.0f);
-					ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
+					sd->flag |= bsdf_transparent_setup(bsdf);
 				}
 			}
 			else {
@@ -722,18 +722,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					if(stack_valid(data_node.y)) {
 						bsdf->T = normalize(stack_load_float3(stack, data_node.y));
 					}
-					else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
-						bsdf->T = normalize(ccl_fetch(sd, dPdv));
+					else if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
+						bsdf->T = normalize(sd->dPdv);
 						bsdf->offset = 0.0f;
 					}
 					else
-						bsdf->T = normalize(ccl_fetch(sd, dPdu));
+						bsdf->T = normalize(sd->dPdu);
 
 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf);
+						sd->flag |= bsdf_hair_reflection_setup(bsdf);
 					}
 					else {
-						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf);
+						sd->flag |= bsdf_hair_transmission_setup(bsdf);
 					}
 				}
 			}
@@ -746,8 +746,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID:
 		case CLOSURE_BSSRDF_BURLEY_ID: {
-			float3 albedo = ccl_fetch(sd, svm_closure_weight);
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 albedo = sd->svm_closure_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			float sample_weight = fabsf(average(weight));
 			
 			/* disable in case of diffuse ancestor, can't see it well then and
@@ -773,7 +773,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bssrdf->albedo = albedo.x;
 					bssrdf->sharpness = sharpness;
 					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+					sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 
 				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
@@ -784,7 +784,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bssrdf->albedo = albedo.y;
 					bssrdf->sharpness = sharpness;
 					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+					sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 
 				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
@@ -795,7 +795,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bssrdf->albedo = albedo.z;
 					bssrdf->sharpness = sharpness;
 					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+					sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 			}
 
@@ -825,21 +825,21 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 
 	switch(type) {
 		case CLOSURE_VOLUME_ABSORPTION_ID: {
-			float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density;
+			float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sd->svm_closure_weight) * mix_weight * density;
 			ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight);
 
 			if(sc) {
-				ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
+				sd->flag |= volume_absorption_setup(sc);
 			}
 			break;
 		}
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density;
+			float3 weight = sd->svm_closure_weight * mix_weight * density;
 			HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight);
 
 			if(volume) {
 				volume->g = param2; /* g */
-				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume);
+				sd->flag |= volume_henyey_greenstein_setup(volume);
 			}
 			break;
 		}
@@ -859,12 +859,12 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_EMISSION;
+	sd->flag |= SD_EMISSION;
 }
 
 ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
@@ -877,10 +877,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight);
 }
 
 ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
@@ -893,12 +893,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_HOLDOUT;
+	sd->flag |= SD_HOLDOUT;
 }
 
 ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@@ -911,19 +911,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_AO;
+	sd->flag |= SD_AO;
 }
 
 /* Closure Nodes */
 
 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-	ccl_fetch(sd, svm_closure_weight) = weight;
+	sd->svm_closure_weight = weight;
 }
 
 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -973,7 +973,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
 {
 	float3 normal = stack_load_float3(stack, in_direction);
-	ccl_fetch(sd, N) = normal;
+	sd->N = normal;
 	stack_store_float3(stack, out_normal, normal);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 890ab41aaaa..c94fa130af7 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -25,10 +25,10 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	uint normal_offset, distance_offset, invert, use_object_space;
 	decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space);
 
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
 
-	float3 dPdx = ccl_fetch(sd, dP).dx;
-	float3 dPdy = ccl_fetch(sd, dP).dy;
+	float3 dPdx = sd->dP.dx;
+	float3 dPdy = sd->dP.dy;
 
 	if(use_object_space) {
 		object_inverse_normal_transform(kg, sd, &normal_in);
@@ -80,14 +80,14 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, flo
 {
 	float d = stack_load_float(stack, fac_offset);
 
-	float3 dP = ccl_fetch(sd, N);
+	float3 dP = sd->N;
 	object_inverse_normal_transform(kg, sd, &dP);
 
 	dP *= d*0.1f; /* todo: get rid of this factor */
 
 	object_dir_transform(kg, sd, &dP);
 
-	ccl_fetch(sd, P) += dP;
+	sd->P += dP;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 23c97d80cb0..3703ec55015 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
 	uint normal_offset, out_offset;
 	decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
 	float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
 	
 	eta = fmaxf(eta, 1e-5f);
-	eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+	eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
-	float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+	float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
 
 	stack_store_float(stack, out_offset, f);
 }
@@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
 
 	float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
-	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
 
 	float f;
 
 	if(type == NODE_LAYER_WEIGHT_FRESNEL) {
 		float eta = fmaxf(1.0f - blend, 1e-5f);
-		eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;
+		eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
 
-		f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+		f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
 	}
 	else {
-		f = fabsf(dot(ccl_fetch(sd, I), normal_in));
+		f = fabsf(dot(sd->I, normal_in));
 
 		if(blend != 0.5f) {
 			blend = clamp(blend, 0.0f, 1.0f-1e-5f);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 7d512f7ff4d..4a09d9f6653 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -27,15 +27,15 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
-		case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
+		case NODE_GEOM_P: data = sd->P; break;
+		case NODE_GEOM_N: data = sd->N; break;
 #ifdef __DPDU__
 		case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
 #endif
-		case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
-		case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
+		case NODE_GEOM_I: data = sd->I; break;
+		case NODE_GEOM_Ng: data = sd->Ng; break;
 #ifdef __UV__
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
+		case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
 #endif
 	}
 
@@ -48,8 +48,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
+		case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
+		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -65,8 +65,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
+		case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
+		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -87,9 +87,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 			stack_store_float3(stack, out_offset, object_location(kg, sd));
 			return;
 		}
-		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
+		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
 		case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
-		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
+		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break;
 		default: data = 0.0f; break;
 	}
 
@@ -106,44 +106,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg,
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_index(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_AGE: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_age(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LIFETIME: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LOCATION: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
 			break;
 		}
 #if 0	/* XXX float4 currently not supported in SVM stack */
 		case NODE_INFO_PAR_ROTATION: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
 			break;
 		}
 #endif
 		case NODE_INFO_PAR_SIZE: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_size(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_ANGULAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
 			break;
 		}
@@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_INFO_CURVE_IS_STRAND: {
-			data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
+			data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
@@ -177,7 +177,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg,
 			break;
 		}
 		/*case NODE_INFO_CURVE_FADE: {
-			data = ccl_fetch(sd, curve_transparency);
+			data = sd->curve_transparency;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}*/
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 2afdf61b476..76acc9253a1 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -144,7 +144,6 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 		case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break;
 		case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break;
 		case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break;
-		case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break;
 		default:
 			kernel_assert(0);
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -238,9 +237,9 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	/* get object space normal */
-	float3 N = ccl_fetch(sd, N);
+	float3 N = sd->N;
 
-	N = ccl_fetch(sd, N);
+	N = sd->N;
 	object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 04f6f623f18..1492e358608 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -31,8 +31,8 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st
 		case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
 		case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
 		case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
-		case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
-		case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+		case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
+		case NODE_LP_ray_length: info = sd->ray_length; break;
 		case NODE_LP_ray_depth: info = (float)state->bounce; break;
 		case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break;
 		case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break;
@@ -56,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
 
 	switch(type) {
 		case NODE_LIGHT_FALLOFF_QUADRATIC: break;
-		case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
-		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
+		case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
+		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
 	}
 
 	float smooth = stack_load_float(stack, smooth_offset);
 
 	if(smooth > 0.0f) {
-		float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
+		float squared = sd->ray_length*sd->ray_length;
 		/* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. */
 		if(isfinite(squared)) {
 			strength *= squared/(smooth + squared);
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index 01547b60014..a7f15de7325 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -134,32 +134,37 @@ ccl_device float3 svm_math_blackbody_color(float t) {
 		{  6.72595954e-13f, -2.73059993e-08f,  4.24068546e-04f, -7.52204323e-01f },
 	};
 
-	if(t >= 12000.0f)
+	int i;
+	if(t >= 12000.0f) {
 		return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
+	}
+	else if(t >= 6365.0f) {
+		i = 5;
+	}
+	else if(t >= 3315.0f) {
+		i = 4;
+	}
+	else if(t >= 1902.0f) {
+		i = 3;
+	}
+	else if(t >= 1449.0f) {
+		i = 2;
+	}
+	else if(t >= 1167.0f) {
+		i = 1;
+	}
+	else if(t >= 965.0f) {
+		i = 0;
+	}
+	else {
+		/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
+		return make_float3(4.70366907f, 0.0f, 0.0f);
+	}
 
-	/* Define a macro to reduce stack usage for nvcc */
-#define MAKE_BB_RGB(i) make_float3(\
-		rc[i][0] / t + rc[i][1] * t + rc[i][2],\
-		gc[i][0] / t + gc[i][1] * t + gc[i][2],\
-		((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3])
-
-	if(t >= 6365.0f)
-		return MAKE_BB_RGB(5);
-	if(t >= 3315.0f)
-		return MAKE_BB_RGB(4);
-	if(t >= 1902.0f)
-		return MAKE_BB_RGB(3);
-	if(t >= 1449.0f)
-		return MAKE_BB_RGB(2);
-	if(t >= 1167.0f)
-		return MAKE_BB_RGB(1);
-	if(t >= 965.0f)
-		return MAKE_BB_RGB(0);
-
-#undef MAKE_BB_RGB
-
-	/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
-	return make_float3(4.70366907f, 0.0f, 0.0f);
+	const float t_inv = 1.0f / t;
+	return make_float3(rc[i][0] * t_inv + rc[i][1] * t + rc[i][2],
+	                   gc[i][0] * t_inv + gc[i][1] * t + gc[i][2],
+	                   ((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3]);
 }
 
 ccl_device_inline float3 svm_math_gamma_color(float3 color, float gamma)
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 62ff38cf1c5..0347ab7b193 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -18,50 +18,42 @@ CCL_NAMESPACE_BEGIN
 
 /* Noise */
 
-ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color)
-{
-	int hard = 0;
-
-	if(distortion != 0.0f) {
-		float3 r, offset = make_float3(13.5f, 13.5f, 13.5f);
-
-		r.x = noise(p + offset) * distortion;
-		r.y = noise(p) * distortion;
-		r.z = noise(p - offset) * distortion;
-
-		p += r;
-	}
-
-	*fac = noise_turbulence(p, detail, hard);
-	*color = make_float3(*fac,
-		noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
-		noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
-}
-
 ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint co_offset, scale_offset, detail_offset, distortion_offset, fac_offset, color_offset;
 
 	decode_node_uchar4(node.y, &co_offset, &scale_offset, &detail_offset, &distortion_offset);
+	decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL);
 
 	uint4 node2 = read_node(kg, offset);
 
 	float scale = stack_load_float_default(stack, scale_offset, node2.x);
 	float detail = stack_load_float_default(stack, detail_offset, node2.y);
 	float distortion = stack_load_float_default(stack, distortion_offset, node2.z);
-	float3 co = stack_load_float3(stack, co_offset);
+	float3 p = stack_load_float3(stack, co_offset) * scale; /* scaled lookup position */
+	int hard = 0; /* passed unchanged to every noise_turbulence() call below */
 
-	float3 color;
-	float f;
+	if(distortion != 0.0f) { /* perturb p by three noise lookups scaled by distortion */
+		float3 r, shift = make_float3(13.5f, 13.5f, 13.5f); /* not named 'offset': that is the node-offset parameter */
+
+		r.x = noise(p + shift) * distortion;
+		r.y = noise(p) * distortion;
+		r.z = noise(p - shift) * distortion;
 
-	svm_noise(co*scale, detail, distortion, &f, &color);
+		p += r;
+	}
 
-	decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL);
+	float f = noise_turbulence(p, detail, hard);
 
-	if(stack_valid(fac_offset))
+	if(stack_valid(fac_offset)) {
 		stack_store_float(stack, fac_offset, f);
-	if(stack_valid(color_offset))
+	}
+	if(stack_valid(color_offset)) { /* the two extra turbulence lookups run only when color is stored */
+		float3 color = make_float3(f,
+			noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
+			noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
 		stack_store_float3(stack, color_offset, color);
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index c0b01262212..c94327401f5 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P);
+			data = sd->P;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -48,47 +48,47 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P));
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
+				data = transform_point(&tfm, sd->P + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
+				data = camera_world_to_ndc(kg, sd, sd->P);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P);
+			data = sd->P;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -112,9 +112,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			data = sd->P + sd->dP.dx;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -129,47 +129,47 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P + sd->dP.dx);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
+				data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			data = sd->P + sd->dP.dx;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -196,9 +196,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			data = sd->P + sd->dP.dy;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -213,47 +213,47 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P + sd->dP.dy);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
+				data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			data = sd->P + sd->dP.dy;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -274,12 +274,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 	float3 color = stack_load_float3(stack, color_offset);
 	color = 2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f);
 
-	bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0;
+	bool is_backfacing = (sd->flag & SD_BACKFACING) != 0;
 	float3 N;
 
 	if(space == NODE_NORMAL_MAP_TANGENT) {
 		/* tangent space */
-		if(ccl_fetch(sd, object) == OBJECT_NONE) {
+		if(sd->object == OBJECT_NONE) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
@@ -299,11 +299,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 		float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL);
 		float3 normal;
 
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+		if(sd->shader & SHADER_SMOOTH_NORMAL) {
 			normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL);
 		}
 		else {
-			normal = ccl_fetch(sd, Ng);
+			normal = sd->Ng;
 
 			/* the normal is already inverted, which is too soon for the math here */
 			if(is_backfacing) {
@@ -345,11 +345,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(strength != 1.0f) {
 		strength = max(strength, 0.0f);
-		N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
+		N = safe_normalize(sd->N + (N - sd->N)*strength);
 	}
 
 	if(is_zero(N)) {
-		N = ccl_fetch(sd, N);
+		N = sd->N;
 	}
 
 	stack_store_float3(stack, normal_offset, N);
@@ -377,7 +377,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 		float3 generated;
 
 		if(desc.offset == ATTR_STD_NOT_FOUND)
-			generated = ccl_fetch(sd, P);
+			generated = sd->P;
 		else
 			generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
@@ -390,7 +390,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 	}
 
 	object_normal_transform(kg, sd, &tangent);
-	tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
+	tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
 	stack_store_float3(stack, tangent_offset, tangent);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 4c32130d06d..4e92f27acdb 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
 	
 	Transform tfm;
-	bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
+	bool is_object = (sd->object != OBJECT_NONE);
 	bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
 	
 	/* From world */
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index a8b3604a8a7..9e826c8c23f 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -46,7 +46,7 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
 #  if defined(__KERNEL_CUDA__)
 #    if __CUDA_ARCH__ >= 300
 	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
-	if(id < 2048) /* TODO(dingto): Make this a variable */
+	if(id < TEX_START_HALF4_CUDA_KEPLER)
 		r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
 	else {
 		float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z);
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 87e40791333..3c6353c8001 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
                                   float3 *P)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
+	if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
 #else
-	if(ccl_fetch(sd, prim) != PRIM_NONE)
+	if(sd->prim != PRIM_NONE)
 #endif
 	{
 		float3 Co[3];
@@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
 		/* Triangles */
 		int np = 3;
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
-			triangle_vertices(kg, ccl_fetch(sd, prim), Co);
+		if(sd->type & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, sd->prim, Co);
 		else
-			motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);
+			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
 
-		if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &Co[0]);
 			object_position_transform(kg, sd, &Co[1]);
 			object_position_transform(kg, sd, &Co[2]);
@@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
 		if(pixel_size) {
 			// Project the derivatives of P to the viewing plane defined
 			// by I so we have a measure of how big is a pixel at this point
-			float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
-			float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+			float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
+			float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
 			// Take the average of both axis' length
 			pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
 		}
@@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg,
 	 * With OpenCL 2.0 it's possible to avoid this change, but for until
 	 * then we'll be living with such an exception.
 	 */
-	float3 P = ccl_fetch(sd, P);
+	float3 P = sd->P;
 	float f = wireframe(kg, sd, size, pixel_size, &P);
 #else
-	float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+	float f = wireframe(kg, sd, size, pixel_size, &sd->P);
 #endif
 
 	/* TODO(sergey): Think of faster way to calculate derivatives. */
 	if(bump_offset == NODE_BUMP_OFFSET_DX) {
-		float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx);
+		float3 Px = sd->P - sd->dP.dx;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx);
 	}
 	else if(bump_offset == NODE_BUMP_OFFSET_DY) {
-		float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy);
+		float3 Py = sd->P - sd->dP.dy;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy);
 	}
 
 	if(stack_valid(out_fac))
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 8eaa9de3874..17ac66644e2 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -1,14 +1,6 @@
 
 set(INC
-	.
-	../device
-	../graph
-	../kernel
-	../kernel/svm
-	../kernel/osl
-	../bvh
-	../subd
-	../util
+	..  # intern/cycles root: sources now include headers by subdirectory, e.g. "render/image.h", "util/util_debug.h"
 	../../glew-mx
 )
 
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index c0d429a583c..e157a385904 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
-#include "image.h"
-#include "mesh.h"
-#include "attribute.h"
+#include "render/image.h"
+#include "render/mesh.h"
+#include "render/attribute.h"
 
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index f4538c76369..a64eb6542d5 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -17,12 +17,12 @@
 #ifndef __ATTRIBUTE_H__
 #define __ATTRIBUTE_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_list.h"
-#include "util_param.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_param.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index 8d7d7b847fd..930debe1e33 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -14,17 +14,17 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "device.h"
-#include "integrator.h"
-#include "graph.h"
-#include "nodes.h"
-#include "scene.h"
-#include "shader.h"
-
-#include "util_foreach.h"
-#include "util_math.h"
-#include "util_types.h"
+#include "render/background.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/graph.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/shader.h"
+
+#include "util/util_foreach.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index 8029c6a9e80..db20b6ebf87 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -17,9 +17,9 @@
 #ifndef __BACKGROUND_H__
 #define __BACKGROUND_H__
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -30,7 +30,7 @@ class Shader;
 
 class Background : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	float ao_factor;
 	float ao_distance;
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index d9a297002c6..c0fcd517390 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "bake.h"
-#include "integrator.h"
+#include "render/bake.h"
+#include "render/integrator.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -171,9 +171,9 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 		/* needs to be up to data for attribute access */
 		device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-		device->mem_alloc(d_input, MEM_READ_ONLY);
+		device->mem_alloc("bake_input", d_input, MEM_READ_ONLY);
 		device->mem_copy_to(d_input);
-		device->mem_alloc(d_output, MEM_READ_WRITE);
+		device->mem_alloc("bake_output", d_output, MEM_READ_WRITE);
 
 		DeviceTask task(DeviceTask::SHADER);
 		task.shader_input = d_input.device_pointer;
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index 25f5eb3c897..ceb94cfb682 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -17,11 +17,11 @@
 #ifndef __BAKE_H__
 #define __BAKE_H__
 
-#include "device.h"
-#include "scene.h"
+#include "device/device.h"
+#include "render/scene.h"
 
-#include "util_progress.h"
-#include "util_vector.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -73,7 +73,7 @@ public:
 
 	bool need_update;
 
-	int total_pixel_samples;
+	size_t total_pixel_samples;
 
 private:
 	BakeData *m_bake_data;
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index f1692712d61..fe2c2e78926 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -16,17 +16,17 @@
 
 #include <stdlib.h>
 
-#include "buffers.h"
-#include "device.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_hash.h"
-#include "util_image.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_time.h"
-#include "util_types.h"
+#include "render/buffers.h"
+#include "device/device.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
+#include "util/util_image.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_time.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -129,13 +129,13 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
 	
 	/* allocate buffer */
 	buffer.resize(params.width*params.height*params.get_passes_size());
-	device->mem_alloc(buffer, MEM_READ_WRITE);
+	device->mem_alloc("render_buffer", buffer, MEM_READ_WRITE);
 	device->mem_zero(buffer);
 
 	/* allocate rng state */
 	rng_state.resize(params.width, params.height);
 
-	device->mem_alloc(rng_state, MEM_READ_WRITE);
+	device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
 }
 
 bool RenderBuffers::copy_from_device()
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index c9c2a21079a..5c78971678a 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -17,16 +17,16 @@
 #ifndef __BUFFERS_H__
 #define __BUFFERS_H__
 
-#include "device_memory.h"
+#include "device/device_memory.h"
 
-#include "film.h"
+#include "render/film.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_half.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index c8c51ec96d2..83ff8a10618 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -14,18 +14,18 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "tables.h"
-
-#include "device.h"
-
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_math_cdf.h"
-#include "util_vector.h"
+#include "render/camera.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/tables.h"
+
+#include "device/device.h"
+
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_math_cdf.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index 141ef9cccef..dd6b831b347 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -17,13 +17,13 @@
 #ifndef __CAMERA_H__
 #define __CAMERA_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_boundbox.h"
-#include "util_transform.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -39,7 +39,7 @@ class Scene;
 
 class Camera : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	/* Specifies an offset for the shutter's time interval. */
 	enum MotionPosition {
diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp
index b7f25663bc3..2569d9eec27 100644
--- a/intern/cycles/render/constant_fold.cpp
+++ b/intern/cycles/render/constant_fold.cpp
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "constant_fold.h"
-#include "graph.h"
+#include "render/constant_fold.h"
+#include "render/graph.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h
index 7962698319f..33f93b8c0ab 100644
--- a/intern/cycles/render/constant_fold.h
+++ b/intern/cycles/render/constant_fold.h
@@ -17,8 +17,8 @@
 #ifndef __CONSTANT_FOLD_H__
 #define __CONSTANT_FOLD_H__
 
-#include "util_types.h"
-#include "svm_types.h"
+#include "util/util_types.h"
+#include "kernel/svm/svm_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index f671eb19cae..4c085b928fb 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "curves.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-
-#include "util_foreach.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_vector.h"
+#include "device/device.h"
+#include "render/curves.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "util/util_foreach.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h
index e41967eebf5..8834764bd63 100644
--- a/intern/cycles/render/curves.h
+++ b/intern/cycles/render/curves.h
@@ -17,8 +17,8 @@
 #ifndef __CURVES_H__
 #define __CURVES_H__
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 923252bb375..7809f4345f1 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -14,19 +14,19 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "device.h"
-#include "film.h"
-#include "integrator.h"
-#include "mesh.h"
-#include "scene.h"
-#include "tables.h"
-
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_math.h"
-#include "util_math_cdf.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/film.h"
+#include "render/integrator.h"
+#include "render/mesh.h"
+#include "render/scene.h"
+#include "render/tables.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_math.h"
+#include "util/util_math_cdf.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 9fa51c51f52..83c941d5c57 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -17,12 +17,12 @@
 #ifndef __FILM_H__
 #define __FILM_H__
 
-#include "util_string.h"
-#include "util_vector.h"
+#include "util/util_string.h"
+#include "util/util_vector.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -53,7 +53,7 @@ public:
 
 class Film : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	float exposure;
 	array<Pass> passes;
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 52c94ec2716..8e61daab49b 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -14,17 +14,18 @@
  * limitations under the License.
  */
 
-#include "attribute.h"
-#include "graph.h"
-#include "nodes.h"
-#include "shader.h"
-#include "constant_fold.h"
-
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_queue.h"
-#include "util_logging.h"
+#include "render/attribute.h"
+#include "render/graph.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/constant_fold.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_queue.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -195,6 +196,7 @@ bool ShaderNode::equals(const ShaderNode& other)
 ShaderGraph::ShaderGraph()
 {
 	finalized = false;
+	simplified = false;
 	num_node_ids = 0;
 	add(new OutputNode());
 }
@@ -207,6 +209,8 @@ ShaderGraph::~ShaderGraph()
 ShaderNode *ShaderGraph::add(ShaderNode *node)
 {
 	assert(!finalized);
+	simplified = false;
+
 	node->id = num_node_ids++;
 	nodes.push_back(node);
 	return node;
@@ -241,6 +245,7 @@ void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to)
 {
 	assert(!finalized);
 	assert(from && to);
+	simplified = false;
 
 	if(to->link) {
 		fprintf(stderr, "Cycles shader graph connect: input already connected.\n");
@@ -273,6 +278,7 @@ void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to)
 void ShaderGraph::disconnect(ShaderOutput *from)
 {
 	assert(!finalized);
+	simplified = false;
 
 	foreach(ShaderInput *sock, from->links) {
 		sock->link = NULL;
@@ -285,6 +291,7 @@ void ShaderGraph::disconnect(ShaderInput *to)
 {
 	assert(!finalized);
 	assert(to->link);
+	simplified = false;
 
 	ShaderOutput *from = to->link;
 
@@ -294,6 +301,8 @@ void ShaderGraph::disconnect(ShaderInput *to)
 
 void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to)
 {
+	simplified = false;
+
 	/* Copy because disconnect modifies this list */
 	vector<ShaderInput*> outputs = from->links;
 
@@ -310,9 +319,19 @@ void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to)
 	}
 }
 
+void ShaderGraph::simplify(Scene *scene)
+{
+	if(!simplified) {
+		default_inputs(scene->shader_manager->use_osl());
+		clean(scene);
+		refine_bump_nodes();
+
+		simplified = true;
+	}
+}
+
 void ShaderGraph::finalize(Scene *scene,
                            bool do_bump,
-                           bool do_osl,
                            bool do_simplify,
                            bool bump_in_object_space)
 {
@@ -322,9 +341,7 @@ void ShaderGraph::finalize(Scene *scene,
 	 * modified afterwards. */
 
 	if(!finalized) {
-		default_inputs(do_osl);
-		clean(scene);
-		refine_bump_nodes();
+		simplify(scene);
 
 		if(do_bump)
 			bump_from_displacement(bump_in_object_space);
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 780fdf49ca4..09932695d1f 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -17,17 +17,17 @@
 #ifndef __GRAPH_H__
 #define __GRAPH_H__
 
-#include "node.h"
-#include "node_type.h"
+#include "graph/node.h"
+#include "graph/node_type.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_list.h"
-#include "util_map.h"
-#include "util_param.h"
-#include "util_set.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_set.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -201,14 +201,14 @@ public:
 /* Node definition utility macros */
 
 #define SHADER_NODE_CLASS(type) \
-	NODE_DECLARE; \
+	NODE_DECLARE \
 	type(); \
 	virtual ShaderNode *clone() const { return new type(*this); } \
 	virtual void compile(SVMCompiler& compiler); \
 	virtual void compile(OSLCompiler& compiler); \
 
 #define SHADER_NODE_NO_CLONE_CLASS(type) \
-	NODE_DECLARE; \
+	NODE_DECLARE \
 	type(); \
 	virtual void compile(SVMCompiler& compiler); \
 	virtual void compile(OSLCompiler& compiler); \
@@ -240,6 +240,7 @@ public:
 	list<ShaderNode*> nodes;
 	size_t num_node_ids;
 	bool finalized;
+	bool simplified;
 
 	ShaderGraph();
 	~ShaderGraph();
@@ -255,9 +256,9 @@ public:
 	void relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to);
 
 	void remove_proxy_nodes();
+	void simplify(Scene *scene);
 	void finalize(Scene *scene,
 	              bool do_bump = false,
-	              bool do_osl = false,
 	              bool do_simplify = false,
 	              bool bump_in_object_space = false);
 
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index fd8a1262208..a8c4f446bea 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "image.h"
-#include "scene.h"
+#include "device/device.h"
+#include "render/image.h"
+#include "render/scene.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_progress.h"
-#include "util_texture.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
+#include "util/util_texture.h"
 
 #ifdef WITH_OSL
 #include <OSL/oslexec.h>
@@ -156,6 +156,16 @@ ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filen
 		}
 	}
 
+	/* Perform preliminary checks, with meaningful logging. */
+	if(!path_exists(filename)) {
+		VLOG(1) << "File '" << filename << "' does not exist.";
+		return IMAGE_DATA_TYPE_BYTE4;
+	}
+	if(path_is_directory(filename)) {
+		VLOG(1) << "File '" << filename << "' is a directory, can't use as image.";
+		return IMAGE_DATA_TYPE_BYTE4;
+	}
+
 	ImageInput *in = ImageInput::create(filename);
 
 	if(in) {
@@ -285,9 +295,8 @@ int ImageManager::add_image(const string& filename,
 
 	thread_scoped_lock device_lock(device_mutex);
 
-	/* Do we have a float? */
-	if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
-		is_float = true;
+	/* Check whether it's a float texture. */
+	is_float = (type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4);
 
 	/* No single channel and half textures on CUDA (Fermi) and no half on OpenCL, use available slots */
 	if((type == IMAGE_DATA_TYPE_FLOAT ||
@@ -433,6 +442,11 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid
 		return false;
 
 	if(!img->builtin_data) {
+		/* NOTE: Error logging is done in meta data acquisition. */
+		if(!path_exists(img->filename) || path_is_directory(img->filename)) {
+			return false;
+		}
+
 		/* load image from file through OIIO */
 		*in = ImageInput::create(img->filename);
 
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 494c74f0cdd..996b5a5b65f 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -17,13 +17,13 @@
 #ifndef __IMAGE_H__
 #define __IMAGE_H__
 
-#include "device.h"
-#include "device_memory.h"
+#include "device/device.h"
+#include "device/device_memory.h"
 
-#include "util_image.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_image.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 1ab0f9874f2..a004bb5b856 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "scene.h"
-#include "shader.h"
-#include "sobol.h"
-
-#include "util_foreach.h"
-#include "util_hash.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/sobol.h"
+
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 27fff4831e5..9501d7f8416 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -17,9 +17,9 @@
 #ifndef __INTEGRATOR_H__
 #define __INTEGRATOR_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -29,7 +29,7 @@ class Scene;
 
 class Integrator : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	int min_bounce;
 	int max_bounce;
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 2245c861d5a..4886dcd563f 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -14,19 +14,19 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
-
-#include "util_foreach.h"
-#include "util_progress.h"
-#include "util_logging.h"
+#include "render/background.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
+
+#include "util/util_foreach.h"
+#include "util/util_progress.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -57,9 +57,9 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 
 	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-	device->mem_alloc(d_input, MEM_READ_ONLY);
+	device->mem_alloc("shade_background_pixels_input", d_input, MEM_READ_ONLY);
 	device->mem_copy_to(d_input);
-	device->mem_alloc(d_output, MEM_WRITE_ONLY);
+	device->mem_alloc("shade_background_pixels_output", d_output, MEM_WRITE_ONLY);
 
 	DeviceTask main_task(DeviceTask::SHADER);
 	main_task.shader_input = d_input.device_pointer;
@@ -486,10 +486,18 @@ static void background_cdf(int start,
                            float2 *cond_cdf)
 {
 	/* Conditional CDFs (rows, U direction). */
+	/* NOTE: It is possible to have some NaN or infinite pixels on background
+	 * which will ruin CDF causing wrong shading. We replace such
+	 * pixels with black.
+	 */
 	for(int i = start; i < end; i++) {
 		float sin_theta = sinf(M_PI_F * (i + 0.5f) / res);
 		float3 env_color = (*pixels)[i * res];
 		float ave_luminance = average(env_color);
+		/* TODO(sergey): Consider adding average_safe(). */
+		if(!isfinite(ave_luminance)) {
+			ave_luminance = 0.0f;
+		}
 
 		cond_cdf[i * cdf_count].x = ave_luminance * sin_theta;
 		cond_cdf[i * cdf_count].y = 0.0f;
@@ -497,6 +505,9 @@ static void background_cdf(int start,
 		for(int j = 1; j < res; j++) {
 			env_color = (*pixels)[i * res + j];
 			ave_luminance = average(env_color);
+			if(!isfinite(ave_luminance)) {
+				ave_luminance = 0.0f;
+			}
 
 			cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta;
 			cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res;
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index f56530b6490..7e9014eb823 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -17,12 +17,12 @@
 #ifndef __LIGHT_H__
 #define __LIGHT_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index c42b32919d4..a4dc06c4345 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -14,29 +14,29 @@
  * limitations under the License.
  */
 
-#include "bvh.h"
-#include "bvh_build.h"
-
-#include "camera.h"
-#include "curves.h"
-#include "device.h"
-#include "graph.h"
-#include "shader.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "scene.h"
-
-#include "osl_globals.h"
-
-#include "subd_split.h"
-#include "subd_patch_table.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_set.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+
+#include "render/camera.h"
+#include "render/curves.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/shader.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "kernel/osl/osl_globals.h"
+
+#include "subd/subd_split.h"
+#include "subd/subd_patch_table.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_set.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -1873,9 +1873,14 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 		dscene->prim_object.reference((uint*)&pack.prim_object[0], pack.prim_object.size());
 		device->tex_alloc("__prim_object", dscene->prim_object);
 	}
+	if(pack.prim_time.size()) {
+		dscene->prim_time.reference((float2*)&pack.prim_time[0], pack.prim_time.size());
+		device->tex_alloc("__prim_time", dscene->prim_time);
+	}
 
 	dscene->data.bvh.root = pack.root_index;
 	dscene->data.bvh.use_qbvh = scene->params.use_qbvh;
+	dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
 }
 
 void MeshManager::device_update_flags(Device * /*device*/,
@@ -2152,6 +2157,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 	device->tex_free(dscene->prim_visibility);
 	device->tex_free(dscene->prim_index);
 	device->tex_free(dscene->prim_object);
+	device->tex_free(dscene->prim_time);
 	device->tex_free(dscene->tri_shader);
 	device->tex_free(dscene->tri_vnormal);
 	device->tex_free(dscene->tri_vindex);
@@ -2173,6 +2179,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->prim_visibility.clear();
 	dscene->prim_index.clear();
 	dscene->prim_object.clear();
+	dscene->prim_time.clear();
 	dscene->tri_shader.clear();
 	dscene->tri_vnormal.clear();
 	dscene->tri_vindex.clear();
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 5f33e30eac2..043ce9d0ffc 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -17,17 +17,18 @@
 #ifndef __MESH_H__
 #define __MESH_H__
 
-#include "attribute.h"
-#include "node.h"
-#include "shader.h"
-
-#include "util_boundbox.h"
-#include "util_list.h"
-#include "util_map.h"
-#include "util_param.h"
-#include "util_transform.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "graph/node.h"
+
+#include "render/attribute.h"
+#include "render/shader.h"
+
+#include "util/util_boundbox.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,7 +49,7 @@ struct PackedPatchTable;
 
 class Mesh : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	/* Mesh Triangle */
 	struct Triangle {
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index adc5b820298..cf28bb16bb7 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
-#include "device.h"
+#include "device/device.h"
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
 
-#include "util_foreach.h"
-#include "util_progress.h"
+#include "util/util_foreach.h"
+#include "util/util_progress.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -121,9 +121,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	/* needs to be up to data for attribute access */
 	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-	device->mem_alloc(d_input, MEM_READ_ONLY);
+	device->mem_alloc("displace_input", d_input, MEM_READ_ONLY);
 	device->mem_copy_to(d_input);
-	device->mem_alloc(d_output, MEM_WRITE_ONLY);
+	device->mem_alloc("displace_output", d_output, MEM_WRITE_ONLY);
 
 	DeviceTask task(DeviceTask::SHADER);
 	task.shader_input = d_input.device_pointer;
diff --git a/intern/cycles/render/mesh_subdivision.cpp b/intern/cycles/render/mesh_subdivision.cpp
index 57c76a9f1c8..585ed77b026 100644
--- a/intern/cycles/render/mesh_subdivision.cpp
+++ b/intern/cycles/render/mesh_subdivision.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "mesh.h"
-#include "attribute.h"
-#include "camera.h"
+#include "render/mesh.h"
+#include "render/attribute.h"
+#include "render/camera.h"
 
-#include "subd_split.h"
-#include "subd_patch.h"
-#include "subd_patch_table.h"
+#include "subd/subd_split.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_patch_table.h"
 
-#include "util_foreach.h"
-#include "util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_algorithm.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index c02c1adb989..3f56690d0c1 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -14,20 +14,21 @@
  * limitations under the License.
  */
 
-#include "image.h"
-#include "integrator.h"
-#include "nodes.h"
-#include "scene.h"
-#include "svm.h"
-#include "svm_color_util.h"
-#include "svm_ramp_util.h"
-#include "svm_math_util.h"
-#include "osl.h"
-#include "constant_fold.h"
-
-#include "util_sky_model.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "render/image.h"
+#include "render/integrator.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/svm.h"
+#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_ramp_util.h"
+#include "kernel/svm/svm_math_util.h"
+#include "render/osl.h"
+#include "render/constant_fold.h"
+
+#include "util/util_sky_model.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -1931,21 +1932,38 @@ GlossyBsdfNode::GlossyBsdfNode()
 void GlossyBsdfNode::simplify_settings(Scene *scene)
 {
 	if(distribution_orig == NBUILTIN_CLOSURES) {
+		roughness_orig = roughness;
 		distribution_orig = distribution;
 	}
+	else {
+		/* By default we use original values, so we don't worry about restoring
+		 * defaults later on and only override them when needed.
+		 */
+		roughness = roughness_orig;
+		distribution = distribution_orig;
+	}
 	Integrator *integrator = scene->integrator;
+	ShaderInput *roughness_input = input("Roughness");
 	if(integrator->filter_glossy == 0.0f) {
 		/* Fallback to Sharp closure for Roughness close to 0.
 		 * Note: Keep the epsilon in sync with kernel!
 		 */
-		ShaderInput *roughness_input = input("Roughness");
 		if(!roughness_input->link && roughness <= 1e-4f) {
+			VLOG(1) << "Using sharp glossy BSDF.";
 			distribution = CLOSURE_BSDF_REFLECTION_ID;
 		}
 	}
 	else {
-		/* Rollback to original distribution when filter glossy is used. */
-		distribution = distribution_orig;
+		/* If filter glossy is used we replace Sharp glossy with GGX so we can
+		 * benefit from closure blur to remove unwanted noise.
+		 */
+		if(roughness_input->link == NULL &&
+		   distribution == CLOSURE_BSDF_REFLECTION_ID)
+		{
+			VLOG(1) << "Using GGX glossy with filter glossy.";
+			distribution = CLOSURE_BSDF_MICROFACET_GGX_ID;
+			roughness = 0.0f;
+		}
 	}
 	closure = distribution;
 }
@@ -1953,7 +1971,8 @@ void GlossyBsdfNode::simplify_settings(Scene *scene)
 bool GlossyBsdfNode::has_integrator_dependency()
 {
 	ShaderInput *roughness_input = input("Roughness");
-	return !roughness_input->link && roughness <= 1e-4f;
+	return !roughness_input->link &&
+	       (distribution == CLOSURE_BSDF_REFLECTION_ID || roughness <= 1e-4f);
 }
 
 void GlossyBsdfNode::compile(SVMCompiler& compiler)
@@ -2008,21 +2027,38 @@ GlassBsdfNode::GlassBsdfNode()
 void GlassBsdfNode::simplify_settings(Scene *scene)
 {
 	if(distribution_orig == NBUILTIN_CLOSURES) {
+		roughness_orig = roughness;
 		distribution_orig = distribution;
 	}
+	else {
+		/* By default we use original values, so we don't worry about restoring
+		 * defaults later on and only override them when needed.
+		 */
+		roughness = roughness_orig;
+		distribution = distribution_orig;
+	}
 	Integrator *integrator = scene->integrator;
+	ShaderInput *roughness_input = input("Roughness");
 	if(integrator->filter_glossy == 0.0f) {
 		/* Fallback to Sharp closure for Roughness close to 0.
 		 * Note: Keep the epsilon in sync with kernel!
 		 */
-		ShaderInput *roughness_input = input("Roughness");
 		if(!roughness_input->link && roughness <= 1e-4f) {
+			VLOG(1) << "Using sharp glass BSDF.";
 			distribution = CLOSURE_BSDF_SHARP_GLASS_ID;
 		}
 	}
 	else {
-		/* Rollback to original distribution when filter glossy is used. */
-		distribution = distribution_orig;
+		/* If filter glossy is used we replace Sharp glass with GGX glass so we
+		 * can benefit from closure blur to remove unwanted noise.
+		 */
+		if(roughness_input->link == NULL &&
+		   distribution == CLOSURE_BSDF_SHARP_GLASS_ID)
+		{
+			VLOG(1) << "Using GGX glass with filter glossy.";
+			distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID;
+			roughness = 0.0f;
+		}
 	}
 	closure = distribution;
 }
@@ -2030,7 +2066,8 @@ void GlassBsdfNode::simplify_settings(Scene *scene)
 bool GlassBsdfNode::has_integrator_dependency()
 {
 	ShaderInput *roughness_input = input("Roughness");
-	return !roughness_input->link && roughness <= 1e-4f;
+	return !roughness_input->link &&
+	       (distribution == CLOSURE_BSDF_SHARP_GLASS_ID || roughness <= 1e-4f);
 }
 
 void GlassBsdfNode::compile(SVMCompiler& compiler)
@@ -2085,21 +2122,38 @@ RefractionBsdfNode::RefractionBsdfNode()
 void RefractionBsdfNode::simplify_settings(Scene *scene)
 {
 	if(distribution_orig == NBUILTIN_CLOSURES) {
+		roughness_orig = roughness;
 		distribution_orig = distribution;
 	}
+	else {
+		/* By default we use the original values, so we don't have to worry about
+		 * restoring defaults later on and only need to override when required.
+		 */
+		roughness = roughness_orig;
+		distribution = distribution_orig;
+	}
 	Integrator *integrator = scene->integrator;
+	ShaderInput *roughness_input = input("Roughness");
 	if(integrator->filter_glossy == 0.0f) {
 		/* Fallback to Sharp closure for Roughness close to 0.
 		 * Note: Keep the epsilon in sync with kernel!
 		 */
-		ShaderInput *roughness_input = input("Roughness");
 		if(!roughness_input->link && roughness <= 1e-4f) {
+			VLOG(1) << "Using sharp refraction BSDF.";
 			distribution = CLOSURE_BSDF_REFRACTION_ID;
 		}
 	}
 	else {
-		/* Rollback to original distribution when filter glossy is used. */
-		distribution = distribution_orig;
+		/* If filter glossy is used we replace sharp refraction with GGX refraction
+		 * so we can benefit from closure blur to remove unwanted noise.
+		 */
+		if(roughness_input->link == NULL &&
+		   distribution == CLOSURE_BSDF_REFRACTION_ID)
+		{
+			VLOG(1) << "Using GGX refraction with filter glossy.";
+			distribution = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+			roughness = 0.0f;
+		}
 	}
 	closure = distribution;
 }
@@ -2107,7 +2161,8 @@ void RefractionBsdfNode::simplify_settings(Scene *scene)
 bool RefractionBsdfNode::has_integrator_dependency()
 {
 	ShaderInput *roughness_input = input("Roughness");
-	return !roughness_input->link && roughness <= 1e-4f;
+	return !roughness_input->link &&
+	       (distribution == CLOSURE_BSDF_REFRACTION_ID || roughness <= 1e-4f);
 }
 
 void RefractionBsdfNode::compile(SVMCompiler& compiler)
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 8d2df673688..d8023747860 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -17,10 +17,10 @@
 #ifndef __NODES_H__
 #define __NODES_H__
 
-#include "graph.h"
-#include "node.h"
+#include "render/graph.h"
+#include "graph/node.h"
 
-#include "util_string.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -324,7 +324,7 @@ private:
 class BsdfNode : public ShaderNode {
 public:
 	explicit BsdfNode(const NodeType *node_type);
-	SHADER_NODE_BASE_CLASS(BsdfNode);
+	SHADER_NODE_BASE_CLASS(BsdfNode)
 
 	bool has_spatial_varying() { return true; }
 	void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL);
@@ -421,7 +421,7 @@ public:
 	bool has_integrator_dependency();
 	ClosureType get_closure_type() { return distribution; }
 
-	float roughness;
+	float roughness, roughness_orig;
 	ClosureType distribution, distribution_orig;
 };
 
@@ -433,7 +433,7 @@ public:
 	bool has_integrator_dependency();
 	ClosureType get_closure_type() { return distribution; }
 
-	float roughness, IOR;
+	float roughness, roughness_orig, IOR;
 	ClosureType distribution, distribution_orig;
 };
 
@@ -445,7 +445,7 @@ public:
 	bool has_integrator_dependency();
 	ClosureType get_closure_type() { return distribution; }
 
-	float roughness, IOR;
+	float roughness, roughness_orig, IOR;
 	ClosureType distribution, distribution_orig;
 };
 
@@ -674,7 +674,7 @@ public:
 
 class MixClosureWeightNode : public ShaderNode {
 public:
-	SHADER_NODE_CLASS(MixClosureWeightNode);
+	SHADER_NODE_CLASS(MixClosureWeightNode)
 
 	float weight;
 	float fac;
@@ -920,7 +920,7 @@ public:
 class CurvesNode : public ShaderNode {
 public:
 	explicit CurvesNode(const NodeType *node_type);
-	SHADER_NODE_BASE_CLASS(CurvesNode);
+	SHADER_NODE_BASE_CLASS(CurvesNode)
 
 	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 8342f376836..375abfeb27a 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -14,22 +14,22 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "device.h"
-#include "light.h"
-#include "mesh.h"
-#include "curves.h"
-#include "object.h"
-#include "particles.h"
-#include "scene.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_vector.h"
-
-#include "subd_patch_table.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/curves.h"
+#include "render/object.h"
+#include "render/particles.h"
+#include "render/scene.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
+
+#include "subd/subd_patch_table.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -49,6 +49,8 @@ NODE_DEFINE(Object)
 	SOCKET_POINT(dupli_generated, "Dupli Generated", make_float3(0.0f, 0.0f, 0.0f));
 	SOCKET_POINT2(dupli_uv, "Dupli UV", make_float2(0.0f, 0.0f));
 
+	SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", false);
+
 	return type;
 }
 
@@ -597,6 +599,12 @@ void ObjectManager::device_update_flags(Device *device,
 		else {
 			object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME;
 		}
+		if(object->is_shadow_catcher) {
+			object_flag[object_index] |= SD_OBJECT_SHADOW_CATCHER;
+		}
+		else {
+			object_flag[object_index] &= ~SD_OBJECT_SHADOW_CATCHER;
+		}
 
 		if(bounds_valid) {
 			foreach(Object *volume_object, volume_objects) {
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 7e306fab2a8..12d7b2c81cf 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -17,14 +17,14 @@
 #ifndef __OBJECT_H__
 #define __OBJECT_H__
 
-#include "node.h"
-#include "scene.h"
+#include "graph/node.h"
+#include "render/scene.h"
 
-#include "util_boundbox.h"
-#include "util_param.h"
-#include "util_transform.h"
-#include "util_thread.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -40,7 +40,7 @@ struct Transform;
 
 class Object : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	Mesh *mesh;
 	Transform tfm;
@@ -53,6 +53,7 @@ public:
 	bool use_motion;
 	bool hide_on_missing_motion;
 	bool use_holdout;
+	bool is_shadow_catcher;
 
 	float3 dupli_generated;
 	float2 dupli_uv;
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 67b68e63cb2..6bff29d1c76 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -14,26 +14,26 @@
  * limitations under the License.
  */
 
-#include "device.h"
+#include "device/device.h"
 
-#include "graph.h"
-#include "light.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
-#include "nodes.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/nodes.h"
 
 #ifdef WITH_OSL
 
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_progress.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
 
 #endif
 
@@ -1096,12 +1096,10 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
 		/* finalize */
 		shader->graph->finalize(scene,
 		                        false,
-		                        true,
 		                        shader->has_integrator_dependency);
 		if(shader->graph_bump) {
 			shader->graph_bump->finalize(scene,
 			                             true,
-			                             true,
 			                             shader->has_integrator_dependency,
 			                             shader->displacement_method == DISPLACE_BOTH);
 		}
diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h
index b131b672b8c..2be1126fdd3 100644
--- a/intern/cycles/render/osl.h
+++ b/intern/cycles/render/osl.h
@@ -17,13 +17,13 @@
 #ifndef __OSL_H__
 #define __OSL_H__
 
-#include "util_set.h"
-#include "util_string.h"
-#include "util_thread.h"
+#include "util/util_set.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
 
-#include "graph.h"
-#include "nodes.h"
-#include "shader.h"
+#include "render/graph.h"
+#include "render/nodes.h"
+#include "render/shader.h"
 
 #ifdef WITH_OSL
 #include <OSL/oslcomp.h>
diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp
index 1a35d60fb4b..a51822a08be 100644
--- a/intern/cycles/render/particles.cpp
+++ b/intern/cycles/render/particles.cpp
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "particles.h"
-#include "scene.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_vector.h"
+#include "device/device.h"
+#include "render/particles.h"
+#include "render/scene.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/particles.h b/intern/cycles/render/particles.h
index 2509e27b44b..66d46114b3e 100644
--- a/intern/cycles/render/particles.h
+++ b/intern/cycles/render/particles.h
@@ -17,8 +17,8 @@
 #ifndef __PARTICLES_H__
 #define __PARTICLES_H__
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 68124e78cb5..4db20338744 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -16,27 +16,27 @@
 
 #include <stdlib.h>
 
-#include "background.h"
-#include "bake.h"
-#include "camera.h"
-#include "curves.h"
-#include "device.h"
-#include "film.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "osl.h"
-#include "particles.h"
-#include "scene.h"
-#include "shader.h"
-#include "svm.h"
-#include "tables.h"
-
-#include "util_foreach.h"
-#include "util_guarded_allocator.h"
-#include "util_logging.h"
-#include "util_progress.h"
+#include "render/background.h"
+#include "render/bake.h"
+#include "render/camera.h"
+#include "render/curves.h"
+#include "device/device.h"
+#include "render/film.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/particles.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/svm.h"
+#include "render/tables.h"
+
+#include "util/util_foreach.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 8768682043f..2b5267642a2 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -17,18 +17,18 @@
 #ifndef __SCENE_H__
 #define __SCENE_H__
 
-#include "image.h"
-#include "shader.h"
+#include "render/image.h"
+#include "render/shader.h"
 
-#include "device_memory.h"
+#include "device/device_memory.h"
 
-#include "util_param.h"
-#include "util_string.h"
-#include "util_system.h"
-#include "util_texture.h"
-#include "util_thread.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
+#include "util/util_texture.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -69,6 +69,7 @@ public:
 	device_vector<uint> prim_visibility;
 	device_vector<uint> prim_index;
 	device_vector<uint> prim_object;
+	device_vector<float2> prim_time;
 
 	/* mesh */
 	device_vector<uint> tri_shader;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 7c01934cfd8..c9b5547b407 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -17,24 +17,24 @@
 #include <string.h>
 #include <limits.h>
 
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "graph.h"
-#include "integrator.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "session.h"
-#include "bake.h"
-
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_task.h"
-#include "util_time.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/bake.h"
+
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_task.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -230,7 +230,9 @@ void Session::run_gpu()
 				while(1) {
 					scoped_timer pause_timer;
 					pause_cond.wait(pause_lock);
-					progress.add_skip_time(pause_timer, params.background);
+					if(pause) {
+						progress.add_skip_time(pause_timer, params.background);
+					}
 
 					update_status_time(pause, no_tiles);
 					progress.set_update();
@@ -520,7 +522,9 @@ void Session::run_cpu()
 				while(1) {
 					scoped_timer pause_timer;
 					pause_cond.wait(pause_lock);
-					progress.add_skip_time(pause_timer, params.background);
+					if(pause) {
+						progress.add_skip_time(pause_timer, params.background);
+					}
 
 					update_status_time(pause, no_tiles);
 					progress.set_update();
@@ -633,6 +637,9 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 			requested_features.use_patch_evaluation = true;
 		}
 #endif
+		if(object->is_shadow_catcher) {
+			requested_features.use_shadow_tricks = true;
+		}
 	}
 
 	BakeManager *bake_manager = scene->bake_manager;
@@ -650,6 +657,8 @@ void Session::load_kernels()
 	if(!kernels_loaded) {
 		progress.set_status("Loading render kernels (may take a few minutes the first time)");
 
+		scoped_timer timer;
+
 		DeviceRequestedFeatures requested_features = get_requested_device_features();
 		VLOG(2) << "Requested features:\n" << requested_features;
 		if(!device->load_kernels(requested_features)) {
@@ -663,6 +672,9 @@ void Session::load_kernels()
 			return;
 		}
 
+		progress.add_skip_time(timer, false);
+		VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start();
+
 		kernels_loaded = true;
 	}
 }
@@ -824,7 +836,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	int progressive_sample = tile_manager.state.sample;
 	int num_samples = tile_manager.get_num_effective_samples();
 
-	int tile = tile_manager.state.num_rendered_tiles;
+	int tile = progress.get_finished_tiles();
 	int num_tiles = tile_manager.state.num_tiles;
 
 	/* update status */
@@ -832,7 +844,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 
 	if(!params.progressive) {
 		const bool is_cpu = params.device.type == DEVICE_CPU;
-		const bool is_last_tile = (progress.get_finished_tiles() + 1) == num_tiles;
+		const bool is_last_tile = (tile + 1) == num_tiles;
 
 		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
 
@@ -883,6 +895,7 @@ void Session::path_trace()
 	task.need_finish_queue = params.progressive_refine;
 	task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
 	task.requested_tile_size = params.tile_size;
+	task.passes_size = tile_manager.params.get_passes_size();
 
 	device->task_add(task);
 }
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index c7ff1446171..a7e5f78a64d 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -17,15 +17,15 @@
 #ifndef __SESSION_H__
 #define __SESSION_H__
 
-#include "buffers.h"
-#include "device.h"
-#include "shader.h"
-#include "tile.h"
-
-#include "util_progress.h"
-#include "util_stats.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "render/buffers.h"
+#include "device/device.h"
+#include "render/shader.h"
+#include "render/tile.h"
+
+#include "util/util_progress.h"
+#include "util/util_stats.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 335edcbe609..12d3c6cf832 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -14,22 +14,22 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "camera.h"
-#include "device.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
-#include "svm.h"
-#include "tables.h"
-
-#include "util_foreach.h"
+#include "render/background.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/svm.h"
+#include "render/tables.h"
+
+#include "util/util_foreach.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 7d896652196..87fef19c592 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -19,20 +19,20 @@
 
 #ifdef WITH_OSL
 /* So no context pollution happens from indirectly included windows.h */
-#  include "util_windows.h"
+#  include "util/util_windows.h"
 #  include <OSL/oslexec.h>
 #endif
 
-#include "attribute.h"
-#include "kernel_types.h"
+#include "render/attribute.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -82,7 +82,7 @@ enum DisplacementMethod {
 
 class Shader : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	int pass_id;
 
diff --git a/intern/cycles/render/sobol.cpp b/intern/cycles/render/sobol.cpp
index e3c2e802067..ce93dc8c5d5 100644
--- a/intern/cycles/render/sobol.cpp
+++ b/intern/cycles/render/sobol.cpp
@@ -46,10 +46,10 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
-#include "sobol.h"
+#include "render/sobol.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/sobol.h b/intern/cycles/render/sobol.h
index 574f148b9a2..9fbce4e14a5 100644
--- a/intern/cycles/render/sobol.h
+++ b/intern/cycles/render/sobol.h
@@ -17,7 +17,7 @@
 #ifndef __SOBOL_H__
 #define __SOBOL_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 955b892f4c3..4cb4018e2b4 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -14,20 +14,20 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "graph.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "scene.h"
-#include "shader.h"
-#include "svm.h"
-
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_foreach.h"
-#include "util_progress.h"
-#include "util_task.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/svm.h"
+
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_progress.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -813,7 +813,6 @@ void SVMCompiler::compile(Scene *scene,
 		scoped_timer timer((summary != NULL)? &summary->time_finalize: NULL);
 		shader->graph->finalize(scene,
 		                        false,
-		                        false,
 		                        shader->has_integrator_dependency);
 	}
 
@@ -821,7 +820,6 @@ void SVMCompiler::compile(Scene *scene,
 		scoped_timer timer((summary != NULL)? &summary->time_finalize_bump: NULL);
 		shader->graph_bump->finalize(scene,
 		                             true,
-		                             false,
 		                             shader->has_integrator_dependency,
 		                             shader->displacement_method == DISPLACE_BOTH);
 	}
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index a501b6bc8b1..abbd9e50610 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -17,13 +17,13 @@
 #ifndef __SVM_H__
 #define __SVM_H__
 
-#include "attribute.h"
-#include "graph.h"
-#include "shader.h"
+#include "render/attribute.h"
+#include "render/graph.h"
+#include "render/shader.h"
 
-#include "util_set.h"
-#include "util_string.h"
-#include "util_thread.h"
+#include "util/util_set.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index dfafd99961b..bf1ef12d602 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "scene.h"
-#include "tables.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/tables.h"
 
-#include "util_debug.h"
-#include "util_logging.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h
index 1bb70b22762..bc261c2a74d 100644
--- a/intern/cycles/render/tables.h
+++ b/intern/cycles/render/tables.h
@@ -17,7 +17,7 @@
 #ifndef __TABLES_H__
 #define __TABLES_H__
 
-#include <util_list.h>
+#include "util/util_list.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index a493c3fa1cd..944e746ca2d 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "tile.h"
+#include "render/tile.h"
 
-#include "util_algorithm.h"
-#include "util_types.h"
+#include "util/util_algorithm.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -131,7 +131,6 @@ void TileManager::reset(BufferParams& params_, int num_samples_)
 	state.buffer = BufferParams();
 	state.sample = range_start_sample - 1;
 	state.num_tiles = 0;
-	state.num_rendered_tiles = 0;
 	state.num_samples = 0;
 	state.resolution_divider = get_divider(params.width, params.height, start_resolution);
 	state.tiles.clear();
@@ -343,7 +342,6 @@ bool TileManager::next_tile(Tile& tile, int device)
 
 	tile = Tile(state.tiles[logical_device].front());
 	state.tiles[logical_device].pop_front();
-	state.num_rendered_tiles++;
 	return true;
 }
 
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 5d92ebac355..622b89f7670 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -19,8 +19,8 @@
 
 #include <limits.h>
 
-#include "buffers.h"
-#include "util_list.h"
+#include "render/buffers.h"
+#include "util/util_list.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -63,7 +63,6 @@ public:
 		int num_samples;
 		int resolution_divider;
 		int num_tiles;
-		int num_rendered_tiles;
 
 		/* Total samples over all pixels: Generally num_samples*num_pixels,
 		 * but can be higher due to the initial resolution division for previews. */
diff --git a/intern/cycles/subd/CMakeLists.txt b/intern/cycles/subd/CMakeLists.txt
index dafb807bdf3..fe0c221ab0d 100644
--- a/intern/cycles/subd/CMakeLists.txt
+++ b/intern/cycles/subd/CMakeLists.txt
@@ -1,11 +1,6 @@
 
 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../util
+	..
 )
 
 set(INC_SYS
diff --git a/intern/cycles/subd/subd_dice.cpp b/intern/cycles/subd/subd_dice.cpp
index a1bd349b167..fae815901ee 100644
--- a/intern/cycles/subd/subd_dice.cpp
+++ b/intern/cycles/subd/subd_dice.cpp
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "mesh.h"
+#include "render/camera.h"
+#include "render/mesh.h"
 
-#include "subd_dice.h"
-#include "subd_patch.h"
+#include "subd/subd_dice.h"
+#include "subd/subd_patch.h"
 
-#include "util_debug.h"
+#include "util/util_debug.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_dice.h b/intern/cycles/subd/subd_dice.h
index 33d13a4ab3a..c0e32be18c4 100644
--- a/intern/cycles/subd/subd_dice.h
+++ b/intern/cycles/subd/subd_dice.h
@@ -22,8 +22,8 @@
  * DiagSplit. For more algorithm details, see the DiagSplit paper or the
  * ARB_tessellation_shader OpenGL extension, Section 2.X.2. */
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_patch.cpp b/intern/cycles/subd/subd_patch.cpp
index d3319c5ccf5..fa2fe2bf113 100644
--- a/intern/cycles/subd/subd_patch.cpp
+++ b/intern/cycles/subd/subd_patch.cpp
@@ -16,12 +16,12 @@
 
 /* Parts adapted from code in the public domain in NVidia Mesh Tools. */
 
-#include "mesh.h"
+#include "render/mesh.h"
 
-#include "subd_patch.h"
+#include "subd/subd_patch.h"
 
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h
index 360c1abf27b..1bb81588835 100644
--- a/intern/cycles/subd/subd_patch.h
+++ b/intern/cycles/subd/subd_patch.h
@@ -17,8 +17,8 @@
 #ifndef __SUBD_PATCH_H__
 #define __SUBD_PATCH_H__
 
-#include "util_boundbox.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_patch_table.cpp b/intern/cycles/subd/subd_patch_table.cpp
index d437b045c07..63bf673a90b 100644
--- a/intern/cycles/subd/subd_patch_table.cpp
+++ b/intern/cycles/subd/subd_patch_table.cpp
@@ -25,10 +25,10 @@
  *
  */
 
-#include "subd_patch_table.h"
-#include "kernel_types.h"
+#include "subd/subd_patch_table.h"
+#include "kernel/kernel_types.h"
 
-#include "util_math.h"
+#include "util/util_math.h"
 
 #ifdef WITH_OPENSUBDIV
 #include <opensubdiv/far/patchTable.h>
diff --git a/intern/cycles/subd/subd_patch_table.h b/intern/cycles/subd/subd_patch_table.h
index 3166a1691d8..907f2dd6c28 100644
--- a/intern/cycles/subd/subd_patch_table.h
+++ b/intern/cycles/subd/subd_patch_table.h
@@ -17,8 +17,8 @@
 #ifndef __SUBD_PATCH_TABLE_H__
 #define __SUBD_PATCH_TABLE_H__
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 #ifdef WITH_OPENSUBDIV
 #ifdef _MSC_VER
diff --git a/intern/cycles/subd/subd_split.cpp b/intern/cycles/subd/subd_split.cpp
index 3c91ad8ab0d..9dbfc1c4e2f 100644
--- a/intern/cycles/subd/subd_split.cpp
+++ b/intern/cycles/subd/subd_split.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "mesh.h"
+#include "render/camera.h"
+#include "render/mesh.h"
 
-#include "subd_dice.h"
-#include "subd_patch.h"
-#include "subd_split.h"
+#include "subd/subd_dice.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"
 
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h
index a2f76dd2e03..f869cc6a48e 100644
--- a/intern/cycles/subd/subd_split.h
+++ b/intern/cycles/subd/subd_split.h
@@ -22,10 +22,10 @@
  * evaluation at arbitrary points is required for this to work. See the paper
  * for more details. */
 
-#include "subd_dice.h"
+#include "subd/subd_dice.h"
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index d8abf671bd6..a015fef8284 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -1,6 +1,6 @@
 
 set(INC
-	.
+	..
 	../../glew-mx
 )
 
@@ -52,6 +52,7 @@ set(SRC_HEADERS
 	util_math.h
 	util_math_cdf.h
 	util_math_fast.h
+	util_math_intersect.h
 	util_md5.h
 	util_opengl.h
 	util_optimization.h
diff --git a/intern/cycles/util/util_aligned_malloc.cpp b/intern/cycles/util/util_aligned_malloc.cpp
index 15d2eb3271b..cc7252dcc58 100644
--- a/intern/cycles/util/util_aligned_malloc.cpp
+++ b/intern/cycles/util/util_aligned_malloc.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "util_aligned_malloc.h"
-#include "util_guarded_allocator.h"
+#include "util/util_aligned_malloc.h"
+#include "util/util_guarded_allocator.h"
 
 #include <cassert>
 
diff --git a/intern/cycles/util/util_aligned_malloc.h b/intern/cycles/util/util_aligned_malloc.h
index ecc0f28c376..cf1e86ca916 100644
--- a/intern/cycles/util/util_aligned_malloc.h
+++ b/intern/cycles/util/util_aligned_malloc.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_ALIGNED_MALLOC_H__
 #define __UTIL_ALIGNED_MALLOC_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 433e41fbbb6..6c52117ef9a 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -32,6 +32,13 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
 	}
 }
 
+#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
+
+#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+
+#define CCL_LOCAL_MEM_FENCE 0
+#define ccl_barrier(flags) (void)0
+
 #else  /* __KERNEL_GPU__ */
 
 #ifdef __KERNEL_OPENCL__
@@ -39,7 +46,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
 /* Float atomics implementation credits:
  *   http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
  */
-ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source,
+ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source,
                                         const float operand)
 {
 	union {
@@ -56,10 +63,29 @@ ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *sou
 	} while(atomic_cmpxchg((volatile ccl_global unsigned int *)source,
 	                       prev_value.int_value,
 	                       new_value.int_value) != prev_value.int_value);
+	return new_value.float_value;
 }
 
+#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
+#define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
+
+#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
+#define ccl_barrier(flags) barrier(flags)
+
 #endif  /* __KERNEL_OPENCL__ */
 
+#ifdef __KERNEL_CUDA__
+
+#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x))
+
+#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x))
+#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+
+#define CCL_LOCAL_MEM_FENCE
+#define ccl_barrier(flags) __syncthreads()
+
+#endif  /* __KERNEL_CUDA__ */
+
 #endif  /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index dfe4977aef3..ed94ca20211 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -20,10 +20,10 @@
 #include <math.h>
 #include <float.h>
 
-#include "util_math.h"
-#include "util_string.h"
-#include "util_transform.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_string.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index d3598f84b94..4d673dc34d8 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -17,11 +17,11 @@
 #ifndef __UTIL_COLOR_H__
 #define __UTIL_COLOR_H__
 
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 #ifdef __KERNEL_SSE2__
-#include "util_simd.h"
+#include "util/util_simd.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 80d177d2cae..9cfa57dd741 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "util_debug.h"
+#include "util/util_debug.h"
 
 #include <stdlib.h>
 
-#include "util_logging.h"
-#include "util_string.h"
+#include "util/util_logging.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -29,7 +29,8 @@ DebugFlags::CPU::CPU()
     sse41(true),
     sse3(true),
     sse2(true),
-    qbvh(true)
+    qbvh(true),
+    split_kernel(false)
 {
 	reset();
 }
@@ -55,10 +56,12 @@ void DebugFlags::CPU::reset()
 #undef CHECK_CPU_FLAGS
 
 	qbvh = true;
+	split_kernel = false;
 }
 
 DebugFlags::CUDA::CUDA()
-  : adaptive_compile(false)
+  : adaptive_compile(false),
+    split_kernel(false)
 {
 	reset();
 }
@@ -67,12 +70,15 @@ void DebugFlags::CUDA::reset()
 {
 	if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
 		adaptive_compile = true;
+
+	split_kernel = false;
 }
 
 DebugFlags::OpenCL::OpenCL()
   : device_type(DebugFlags::OpenCL::DEVICE_ALL),
     kernel_type(DebugFlags::OpenCL::KERNEL_DEFAULT),
-    debug(false)
+    debug(false),
+    single_program(false)
 {
 	reset();
 }
@@ -112,6 +118,7 @@ void DebugFlags::OpenCL::reset()
 	}
 	/* Initialize other flags from environment variables. */
 	debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
+	single_program = (getenv("CYCLES_OPENCL_SINGLE_PROGRAM") != NULL);
 }
 
 DebugFlags::DebugFlags()
@@ -133,7 +140,9 @@ std::ostream& operator <<(std::ostream &os,
 	   << "  AVX    : " << string_from_bool(debug_flags.cpu.avx)   << "\n"
 	   << "  SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
 	   << "  SSE3   : " << string_from_bool(debug_flags.cpu.sse3)  << "\n"
-	   << "  SSE2   : " << string_from_bool(debug_flags.cpu.sse2)  << "\n";
+	   << "  SSE2   : " << string_from_bool(debug_flags.cpu.sse2)  << "\n"
+	   << "  QBVH   : " << string_from_bool(debug_flags.cpu.qbvh)  << "\n"
+	   << "  Split  : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
 
 	os << "CUDA flags:\n"
 	   << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
@@ -172,9 +181,10 @@ std::ostream& operator <<(std::ostream &os,
 			break;
 	}
 	os << "OpenCL flags:\n"
-	   << "  Device type : " << opencl_device_type << "\n"
-	   << "  Kernel type : " << opencl_kernel_type << "\n"
-	   << "  Debug       : " << string_from_bool(debug_flags.opencl.debug)
+	   << "  Device type    : " << opencl_device_type << "\n"
+	   << "  Kernel type    : " << opencl_kernel_type << "\n"
+	   << "  Debug          : " << string_from_bool(debug_flags.opencl.debug) << "\n"
+	   << "  Single program : " << string_from_bool(debug_flags.opencl.single_program)
 	   << "\n";
 	return os;
 }
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 73fd228b5d9..4505d584490 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -20,7 +20,7 @@
 #include <cassert>
 #include <iostream>
 
-#include "util_static_assert.h"
+#include "util/util_static_assert.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -46,6 +46,9 @@ public:
 
 		/* Whether QBVH usage is allowed or not. */
 		bool qbvh;
+
+		/* Whether split kernel is used */
+		bool split_kernel;
 	};
 
 	/* Descriptor of CUDA feature-set to be used. */
@@ -58,6 +61,9 @@ public:
 		/* Whether adaptive feature based runtime compile is enabled or not.
 		 * Requires the CUDA Toolkit and only works on Linux atm. */
 		bool adaptive_compile;
+
+		/* Whether split kernel is used */
+		bool split_kernel;
 	};
 
 	/* Descriptor of OpenCL feature-set to be used. */
@@ -106,6 +112,9 @@ public:
 
 		/* Use debug version of the kernel. */
 		bool debug;
+
+		/* Use single program */
+		bool single_program;
 	};
 
 	/* Get instance of debug flags registry. */
diff --git a/intern/cycles/util/util_guarded_allocator.cpp b/intern/cycles/util/util_guarded_allocator.cpp
index 615ac95f324..54fa6a80df5 100644
--- a/intern/cycles/util/util_guarded_allocator.cpp
+++ b/intern/cycles/util/util_guarded_allocator.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "util_guarded_allocator.h"
-#include "util_stats.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_stats.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h
index 78453d214be..5f9dcfb2481 100644
--- a/intern/cycles/util/util_guarded_allocator.h
+++ b/intern/cycles/util/util_guarded_allocator.h
@@ -20,8 +20,8 @@
 #include <cstddef>
 #include <memory>
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
 #ifdef WITH_BLENDER_GUARDEDALLOC
 #  include "../../guardedalloc/MEM_guardedalloc.h"
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 5db3384cda4..612228dd1c1 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -17,10 +17,11 @@
 #ifndef __UTIL_HALF_H__
 #define __UTIL_HALF_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
+#include "util/util_math.h"
 
 #ifdef __KERNEL_SSE2__
-#include "util_simd.h"
+#include "util/util_simd.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
@@ -110,6 +111,28 @@ ccl_device_inline float4 half4_to_float4(half4 h)
 	return f;
 }
 
+ccl_device_inline half float_to_half(float f)
+{
+	const uint u = __float_as_uint(f);
+	/* Sign bit, shifted to its position in the 16-bit result (bit 15). */
+	uint sign_bit = u & 0x80000000;
+	sign_bit >>= 16;
+	/* Exponent field of the float, kept in place for the range checks below. */
+	uint exponent_bits = u & 0x7f800000;
+	/* Non-sign bits (exponent and mantissa). */
+	uint value_bits = u & 0x7fffffff;
+	value_bits >>= 13;  /* Align mantissa on MSB. */
+	value_bits -= 0x1c000;  /* Adjust bias. */
+	/* Flush-to-zero: exponent field below 0x38800000 yields a zero magnitude. */
+	value_bits = (exponent_bits < 0x38800000) ? 0 : value_bits;
+	/* Clamp-to-max: exponent field above 0x47000000 (Inf and NaN included) yields 0x7bff. */
+	value_bits = (exponent_bits > 0x47000000) ? 0x7bff : value_bits;
+	/* Denormals-as-zero. */
+	value_bits = (exponent_bits == 0 ? 0 : value_bits);
+	/* Re-insert sign bit and return. */
+	return (value_bits | sign_bit);
+}
+
 #endif
 
 #endif
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index 98c3a681ff2..a30b7fe288e 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_HASH_H__
 #define __UTIL_HASH_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h
index c8efc551d97..18876841b5b 100644
--- a/intern/cycles/util/util_image.h
+++ b/intern/cycles/util/util_image.h
@@ -21,7 +21,7 @@
 
 #include <OpenImageIO/imageio.h>
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -42,4 +42,4 @@ CCL_NAMESPACE_END
 
 #endif /* __UTIL_IMAGE_H__ */
 
-#include "util_image_impl.h"
+#include "util/util_image_impl.h"
diff --git a/intern/cycles/util/util_image_impl.h b/intern/cycles/util/util_image_impl.h
index 73ecfda0855..a0f9c66f979 100644
--- a/intern/cycles/util/util_image_impl.h
+++ b/intern/cycles/util/util_image_impl.h
@@ -17,9 +17,10 @@
 #ifndef __UTIL_IMAGE_IMPL_H__
 #define __UTIL_IMAGE_IMPL_H__
 
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_image.h"
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_half.h"
+#include "util/util_image.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -38,6 +39,52 @@ const T *util_image_read(const vector<T>& pixels,
 	return &pixels[index];
 }
 
+/* Cast input pixel from unknown storage to float. */
+template<typename T>
+inline float cast_to_float(T value);
+
+template<>
+inline float cast_to_float(float value)
+{
+	return value;
+}
+template<>
+inline float cast_to_float(uchar value)
+{
+	return (float)value / 255.0f;
+}
+template<>
+inline float cast_to_float(half value)
+{
+	return half_to_float(value);
+}
+
+/* Cast float value to output pixel type. */
+template<typename T>
+inline T cast_from_float(float value);
+
+template<>
+inline float cast_from_float(float value)
+{
+	return value;
+}
+template<>
+inline uchar cast_from_float(float value)
+{
+	if(value < 0.0f) {
+		return 0;
+	}
+	else if(value > (1.0f - 0.5f / 255.0f)) {
+		return 255;
+	}
+	return (uchar)((255.0f * value) + 0.5f);
+}
+template<>
+inline half cast_from_float(float value)
+{
+	return float_to_half(value);
+}
+
 template<typename T>
 void util_image_downscale_sample(const vector<T>& pixels,
                                  const size_t width,
@@ -71,15 +118,22 @@ void util_image_downscale_sample(const vector<T>& pixels,
 				                                 components,
 				                                 nx, ny, nz);
 				for(size_t k = 0; k < components; ++k) {
-					accum[k] += pixel[k];
+					accum[k] += cast_to_float(pixel[k]);
 				}
 				++count;
 			}
 		}
 	}
-	const float inv_count = 1.0f / (float)count;
-	for(size_t k = 0; k < components; ++k) {
-		result[k] = T(accum[k] * inv_count);
+	if(count != 0) {
+		const float inv_count = 1.0f / (float)count;
+		for(size_t k = 0; k < components; ++k) {
+			result[k] = cast_from_float<T>(accum[k] * inv_count);
+		}
+	}
+	else {
+		for(size_t k = 0; k < components; ++k) {
+			result[k] = T(0.0f);
+		}
 	}
 }
 
diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp
index 03041723e15..a5a3bd34fff 100644
--- a/intern/cycles/util/util_logging.cpp
+++ b/intern/cycles/util/util_logging.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include <util_logging.h>
+#include "util/util_logging.h"
 
-#include "util_math.h"
+#include "util/util_math.h"
 
 #include <stdio.h>
 #ifdef _MSC_VER
@@ -69,6 +69,15 @@ void util_logging_verbosity_set(int verbosity)
 }
 
 std::ostream& operator <<(std::ostream &os,
+                          const int2 &value)
+{
+	os << "(" << value.x
+	   << ", " << value.y
+	   << ")";
+	return os;
+}
+
+std::ostream& operator <<(std::ostream &os,
                           const float3 &value)
 {
 	os << "(" << value.x
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 2aa9c25b1a0..ecf9c9cfee0 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -45,6 +45,7 @@ public:
 
 #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level)
 
+struct int2;
 struct float3;
 
 void util_logging_init(const char *argv0);
@@ -52,6 +53,8 @@ void util_logging_start(void);
 void util_logging_verbosity_set(int verbosity);
 
 std::ostream& operator <<(std::ostream &os,
+                          const int2 &value);
+std::ostream& operator <<(std::ostream &os,
                           const float3 &value);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 2b81c8c498a..e0305b978b9 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -35,7 +35,7 @@
 
 #endif
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -43,41 +43,41 @@ CCL_NAMESPACE_BEGIN
 
 /* Division */
 #ifndef M_PI_F
-#define M_PI_F		((float)3.14159265358979323846264338327950288) 		/* pi */
+#define M_PI_F    (3.1415926535897932f)  /* pi */
 #endif
 #ifndef M_PI_2_F
-#define M_PI_2_F	((float)1.57079632679489661923132169163975144) 		/* pi/2 */
+#define M_PI_2_F  (1.5707963267948966f)  /* pi/2 */
 #endif
 #ifndef M_PI_4_F
-#define M_PI_4_F	((float)0.785398163397448309615660845819875721) 	/* pi/4 */
+#define M_PI_4_F  (0.7853981633974483f)  /* pi/4 */
 #endif
 #ifndef M_1_PI_F
-#define M_1_PI_F	((float)0.318309886183790671537767526745028724) 	/* 1/pi */
+#define M_1_PI_F  (0.3183098861837907f)  /* 1/pi */
 #endif
 #ifndef M_2_PI_F
-#define M_2_PI_F	((float)0.636619772367581343075535053490057448) 	/* 2/pi */
+#define M_2_PI_F  (0.6366197723675813f)  /* 2/pi */
 #endif
 
 /* Multiplication */
 #ifndef M_2PI_F
-#define M_2PI_F		((float)6.283185307179586476925286766559005768)		/* 2*pi */
+#define M_2PI_F   (6.2831853071795864f)  /* 2*pi */
 #endif
 #ifndef M_4PI_F
-#define M_4PI_F		((float)12.56637061435917295385057353311801153)		/* 4*pi */
+#define M_4PI_F   (12.566370614359172f)  /* 4*pi */
 #endif
 
 /* Float sqrt variations */
 
 #ifndef M_SQRT2_F
-#define M_SQRT2_F	((float)1.41421356237309504880) 					/* sqrt(2) */
+#define M_SQRT2_F (1.4142135623730950f)  /* sqrt(2) */
 #endif
 
 #ifndef M_LN2_F
-#define M_LN2_F      ((float)0.6931471805599453)        /* ln(2) */
+#define M_LN2_F   (0.6931471805599453f)  /* ln(2) */
 #endif
 
 #ifndef M_LN10_F
-#define M_LN10_F     ((float)2.3025850929940457)        /* ln(10) */
+#define M_LN10_F  (2.3025850929940457f)  /* ln(10) */
 #endif
 
 /* Scalar */
@@ -774,6 +774,7 @@ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __force
 	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
 }
 
+#if defined(__KERNEL_SSE3__)
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
 {
 	return _mm_moveldup_ps(b);
@@ -783,6 +784,7 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
 {
 	return _mm_movehdup_ps(b);
 }
+#endif
 
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
 {
@@ -1241,19 +1243,6 @@ ccl_device_inline float __uint_as_float(uint i)
 	return u.f;
 }
 
-/* Versions of functions which are safe for fast math. */
-ccl_device_inline bool isnan_safe(float f)
-{
-	unsigned int x = __float_as_uint(f);
-	return (x << 1) > 0xff000000u;
-}
-
-ccl_device_inline bool isfinite_safe(float f)
-{
-	/* By IEEE 754 rule, 2*Inf equals Inf */
-	unsigned int x = __float_as_uint(f);
-	return (f == f) && (x == 0 || (f != 2.0f*f));
-}
 
 /* Interpolation */
 
@@ -1271,6 +1260,20 @@ ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const
 
 #endif
 
+/* Versions of functions which are safe for fast math. */
+ccl_device_inline bool isnan_safe(float f)
+{
+	unsigned int x = __float_as_uint(f);
+	return (x << 1) > 0xff000000u;
+}
+
+ccl_device_inline bool isfinite_safe(float f)
+{
+	/* By IEEE 754 rule, 2*Inf equals Inf; +0 and -0 are matched by bit pattern since 2*0 == 0. */
+	unsigned int x = __float_as_uint(f);
+	return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
+}
+
 /* Orthonormal vectors */
 
 ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b)
@@ -1329,7 +1332,7 @@ ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b)
 	y = (b.y != 0.0f)? a.y/b.y: 0.0f;
 	z = (b.z != 0.0f)? a.z/b.z: 0.0f;
 
-	/* try to get grey even if b is zero */
+	/* try to get gray even if b is zero */
 	if(b.x == 0.0f) {
 		if(b.y == 0.0f) {
 			x = z;
@@ -1451,181 +1454,9 @@ ccl_device_inline float beta(float x, float y)
 #endif
 }
 
-/* Ray Intersection */
-
-ccl_device bool ray_sphere_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 sphere_P, float sphere_radius,
-	float3 *isect_P, float *isect_t)
-{
-	float3 d = sphere_P - ray_P;
-	float radiussq = sphere_radius*sphere_radius;
-	float tsq = dot(d, d);
-
-	if(tsq > radiussq) { /* ray origin outside sphere */
-		float tp = dot(d, ray_D);
-
-		if(tp < 0.0f) /* dir points away from sphere */
-			return false;
-
-		float dsq = tsq - tp*tp; /* pythagoras */
-
-		if(dsq > radiussq) /* closest point on ray outside sphere */
-			return false;
-
-		float t = tp - sqrtf(radiussq - dsq); /* pythagoras */
-
-		if(t < ray_t) {
-			*isect_t = t;
-			*isect_P = ray_P + ray_D*t;
-			return true;
-		}
-	}
-
-	return false;
-}
-
-ccl_device bool ray_aligned_disk_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 disk_P, float disk_radius,
-	float3 *isect_P, float *isect_t)
+ccl_device_inline float xor_signmask(float x, int y)
 {
-	/* aligned disk normal */
-	float disk_t;
-	float3 disk_N = normalize_len(ray_P - disk_P, &disk_t);
-	float div = dot(ray_D, disk_N);
-
-	if(UNLIKELY(div == 0.0f))
-		return false;
-
-	/* compute t to intersection point */
-	float t = -disk_t/div;
-	if(t < 0.0f || t > ray_t)
-		return false;
-	
-	/* test if within radius */
-	float3 P = ray_P + ray_D*t;
-	if(len_squared(P - disk_P) > disk_radius*disk_radius)
-		return false;
-
-	*isect_P = P;
-	*isect_t = t;
-
-	return true;
-}
-
-ccl_device bool ray_triangle_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 v0, float3 v1, float3 v2,
-	float3 *isect_P, float *isect_t)
-{
-	/* Calculate intersection */
-	float3 e1 = v1 - v0;
-	float3 e2 = v2 - v0;
-	float3 s1 = cross(ray_D, e2);
-
-	const float divisor = dot(s1, e1);
-	if(UNLIKELY(divisor == 0.0f))
-		return false;
-
-	const float invdivisor = 1.0f/divisor;
-
-	/* compute first barycentric coordinate */
-	const float3 d = ray_P - v0;
-	const float u = dot(d, s1)*invdivisor;
-	if(u < 0.0f)
-		return false;
-
-	/* Compute second barycentric coordinate */
-	const float3 s2 = cross(d, e1);
-	const float v = dot(ray_D, s2)*invdivisor;
-	if(v < 0.0f)
-		return false;
-
-	const float b0 = 1.0f - u - v;
-	if(b0 < 0.0f)
-		return false;
-
-	/* compute t to intersection point */
-	const float t = dot(e2, s2)*invdivisor;
-	if(t < 0.0f || t > ray_t)
-		return false;
-
-	*isect_t = t;
-	*isect_P = ray_P + ray_D*t;
-
-	return true;
-}
-
-ccl_device_inline bool ray_triangle_intersect_uv(
-        float3 ray_P, float3 ray_D, float ray_t,
-        float3 v0, float3 v1, float3 v2,
-        float *isect_u, float *isect_v, float *isect_t)
-{
-	/* Calculate intersection */
-	float3 e1 = v1 - v0;
-	float3 e2 = v2 - v0;
-	float3 s1 = cross(ray_D, e2);
-
-	const float divisor = dot(s1, e1);
-	if(UNLIKELY(divisor == 0.0f))
-		return false;
-
-	const float invdivisor = 1.0f/divisor;
-
-	/* compute first barycentric coordinate */
-	const float3 d = ray_P - v0;
-	const float u = dot(d, s1)*invdivisor;
-	if(u < 0.0f)
-		return false;
-
-	/* Compute second barycentric coordinate */
-	const float3 s2 = cross(d, e1);
-	const float v = dot(ray_D, s2)*invdivisor;
-	if(v < 0.0f)
-		return false;
-
-	const float b0 = 1.0f - u - v;
-	if(b0 < 0.0f)
-		return false;
-
-	/* compute t to intersection point */
-	const float t = dot(e2, s2)*invdivisor;
-	if(t < 0.0f || t > ray_t)
-		return false;
-
-	*isect_u = u;
-	*isect_v = v;
-	*isect_t = t;
-
-	return true;
-}
-
-ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, float ray_mint, float ray_maxt,
-                                   float3 quad_P, float3 quad_u, float3 quad_v, float3 quad_n,
-                                   float3 *isect_P, float *isect_t, float *isect_u, float *isect_v)
-{
-	float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
-	if(t < ray_mint || t > ray_maxt)
-		return false;
-
-	float3 hit = ray_P + t*ray_D;
-	float3 inplane = hit - quad_P;
-
-	float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f;
-	if(u < 0.0f || u > 1.0f)
-		return false;
-
-	float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f;
-	if(v < 0.0f || v > 1.0f)
-		return false;
-
-	if(isect_P) *isect_P = hit;
-	if(isect_t) *isect_t = t;
-	if(isect_u) *isect_u = u;
-	if(isect_v) *isect_v = v;
-
-	return true;
+	return __int_as_float(__float_as_int(x) ^ y);
 }
 
 /* projections */
@@ -1690,4 +1521,3 @@ ccl_device_inline int util_max_axis(float3 vec)
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_MATH_H__ */
-
diff --git a/intern/cycles/util/util_math_cdf.cpp b/intern/cycles/util/util_math_cdf.cpp
index ec78ca15d88..c14d4793ea1 100644
--- a/intern/cycles/util/util_math_cdf.cpp
+++ b/intern/cycles/util/util_math_cdf.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "util_math_cdf.h"
+#include "util/util_math_cdf.h"
 
-#include "util_algorithm.h"
-#include "util_math.h"
+#include "util/util_algorithm.h"
+#include "util/util_math.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_math_cdf.h b/intern/cycles/util/util_math_cdf.h
index 47dfb68ba44..79643fe26e3 100644
--- a/intern/cycles/util/util_math_cdf.h
+++ b/intern/cycles/util/util_math_cdf.h
@@ -17,9 +17,9 @@
 #ifndef __UTIL_MATH_CDF_H__
 #define __UTIL_MATH_CDF_H__
 
-#include "util_algorithm.h"
-#include "util_math.h"
-#include "util_vector.h"
+#include "util/util_algorithm.h"
+#include "util/util_math.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h
new file mode 100644
index 00000000000..2b65a0dfa48
--- /dev/null
+++ b/intern/cycles/util/util_math_intersect.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INTERSECT_H__
+#define __UTIL_MATH_INTERSECT_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Ray Intersection */
+
+ccl_device bool ray_sphere_intersect(
+        float3 ray_P, float3 ray_D, float ray_t,
+        float3 sphere_P, float sphere_radius,
+        float3 *isect_P, float *isect_t)
+{
+	const float3 d = sphere_P - ray_P;
+	const float radiussq = sphere_radius*sphere_radius;
+	const float tsq = dot(d, d);
+
+	if(tsq > radiussq) {
+		/* Ray origin outside sphere. */
+		const float tp = dot(d, ray_D);
+		if(tp < 0.0f) {
+			/* Ray points away from sphere. */
+			return false;
+		}
+		const float dsq = tsq - tp*tp;  /* Pythagoras (presumably ray_D is unit length; confirm with callers). */
+		if(dsq > radiussq)  {
+			/* Closest point on ray outside sphere. */
+			return false;
+		}
+		const float t = tp - sqrtf(radiussq - dsq);  /* Pythagoras. */
+		if(t < ray_t) {
+			*isect_t = t;
+			*isect_P = ray_P + ray_D*t;
+			return true;
+		}
+	}
+	return false;  /* Also reached when the ray origin is inside or on the sphere. */
+}
+
+ccl_device bool ray_aligned_disk_intersect(
+        float3 ray_P, float3 ray_D, float ray_t,
+        float3 disk_P, float disk_radius,
+        float3 *isect_P, float *isect_t)
+{
+	/* Aligned disk normal. */
+	float disk_t;
+	const float3 disk_N = normalize_len(ray_P - disk_P, &disk_t);
+	const float div = dot(ray_D, disk_N);
+	if(UNLIKELY(div == 0.0f)) {
+		return false;
+	}
+	/* Compute t to intersection point. */
+	const float t = -disk_t/div;
+	if(t < 0.0f || t > ray_t) {
+		return false;
+	}
+	/* Test if within radius. */
+	float3 P = ray_P + ray_D*t;
+	if(len_squared(P - disk_P) > disk_radius*disk_radius) {
+		return false;
+	}
+	*isect_P = P;
+	*isect_t = t;
+	return true;
+}
+
+#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
+ccl_device_inline
+#else
+ccl_device_forceinline
+#endif
+bool ray_triangle_intersect(
+        float3 ray_P, float3 ray_dir, float ray_t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+        const ssef *ssef_verts,
+#else
+        const float3 tri_a, const float3 tri_b, const float3 tri_c,
+#endif
+        float *isect_u, float *isect_v, float *isect_t)
+{
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	typedef ssef float3;
+	const float3 tri_a(ssef_verts[0]);
+	const float3 tri_b(ssef_verts[1]);
+	const float3 tri_c(ssef_verts[2]);
+	const float3 P(ray_P);
+	const float3 dir(ray_dir);
+#else
+#  define dot3(a, b) dot(a, b)
+	const float3 P = ray_P;
+	const float3 dir = ray_dir;
+#endif
+
+	/* Calculate vertices relative to ray origin. */
+	const float3 v0 = tri_c - P;
+	const float3 v1 = tri_a - P;
+	const float3 v2 = tri_b - P;
+
+	/* Calculate triangle edges. */
+	const float3 e0 = v2 - v0;
+	const float3 e1 = v0 - v1;
+	const float3 e2 = v1 - v2;
+
+	/* Perform edge tests. */
+#ifdef __KERNEL_SSE2__
+	const float3 crossU = cross(v2 + v0, e0);
+	const float3 crossV = cross(v0 + v1, e1);
+	const float3 crossW = cross(v1 + v2, e2);
+#  ifndef __KERNEL_SSE__
+	const ssef crossX(crossU.x, crossV.x, crossW.x, crossW.x);
+	const ssef crossY(crossU.y, crossV.y, crossW.y, crossW.y);
+	const ssef crossZ(crossU.z, crossV.z, crossW.z, crossW.z);
+#  else
+	ssef crossX(crossU);
+	ssef crossY(crossV);
+	ssef crossZ(crossW);
+	ssef zero = _mm_setzero_ps();
+	_MM_TRANSPOSE4_PS(crossX, crossY, crossZ, zero);
+#  endif
+	const ssef dirX(ray_dir.x);
+	const ssef dirY(ray_dir.y);
+	const ssef dirZ(ray_dir.z);
+	/*const*/ ssef UVWW = crossX*dirX + crossY*dirY + crossZ*dirZ;
+	const float minUVW = reduce_min(UVWW);
+	const float maxUVW = reduce_max(UVWW);
+#else  /* __KERNEL_SSE2__ */
+	const float U = dot(cross(v2 + v0, e0), ray_dir);
+	const float V = dot(cross(v0 + v1, e1), ray_dir);
+	const float W = dot(cross(v1 + v2, e2), ray_dir);
+	const float minUVW = min(U, min(V, W));
+	const float maxUVW = max(U, max(V, W));
+#endif  /* __KERNEL_SSE2__ */
+
+	if(minUVW < 0.0f && maxUVW > 0.0f) {
+		return false;
+	}
+
+	/* Calculate geometry normal and denominator. */
+	const float3 Ng1 = cross(e1, e0);
+	//const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
+	const float3 Ng = Ng1 + Ng1;
+	const float den = dot3(Ng, dir);
+	/* Avoid division by 0. */
+	if(UNLIKELY(den == 0.0f)) {
+		return false;
+	}
+
+	/* Perform depth test. */
+	const float T = dot3(v0, Ng);
+	const int sign_den = (__float_as_int(den) & 0x80000000);
+	const float sign_T = xor_signmask(T, sign_den);
+	if((sign_T < 0.0f) ||
+	   (sign_T > ray_t * xor_signmask(den, sign_den)))
+	{
+		return false;
+	}
+
+	const float inv_den = 1.0f / den;
+#ifdef __KERNEL_SSE2__
+	UVWW *= inv_den;
+	_mm_store_ss(isect_u, UVWW);
+	_mm_store_ss(isect_v, shuffle<1,1,3,3>(UVWW));
+#else
+	*isect_u = U * inv_den;
+	*isect_v = V * inv_den;
+#endif
+	*isect_t = T * inv_den;
+	return true;
+
+#undef dot3
+}
+
+ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D,
+                                   float ray_mint, float ray_maxt,
+                                   float3 quad_P,
+                                   float3 quad_u, float3 quad_v, float3 quad_n,
+                                   float3 *isect_P, float *isect_t,
+                                   float *isect_u, float *isect_v)
+{
+	/* Perform intersection test. */
+	float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
+	if(t < ray_mint || t > ray_maxt) {
+		return false;
+	}
+	const float3 hit = ray_P + t*ray_D;
+	const float3 inplane = hit - quad_P;
+	const float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f;
+	if(u < 0.0f || u > 1.0f) {
+		return false;
+	}
+	const float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f;
+	if(v < 0.0f || v > 1.0f) {
+		return false;
+	}
+	/* Store the result. */
+	/* TODO(sergey): Check whether we can avoid some checks here. */
+	if(isect_P != NULL) *isect_P = hit;
+	if(isect_t != NULL) *isect_t = t;
+	if(isect_u != NULL) *isect_u = u;
+	if(isect_v != NULL) *isect_v = v;
+	return true;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INTERSECT_H__ */
diff --git a/intern/cycles/util/util_md5.h b/intern/cycles/util/util_md5.h
index d0af9fdb004..e4cd66c85b0 100644
--- a/intern/cycles/util/util_md5.h
+++ b/intern/cycles/util/util_md5.h
@@ -30,8 +30,8 @@
 #ifndef __UTIL_MD5_H__
 #define __UTIL_MD5_H__
 
-#include "util_string.h"
-#include "util_types.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index adc141a7b28..6f70a474fe7 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -111,7 +111,7 @@
 
 /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
  * Since we can't avoid including <windows.h>, better only include that */
-#include "util_windows.h"
+#include "util/util_windows.h"
 
 #endif
 
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 5df262fcbbb..cd3067f7650 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "util_debug.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_string.h"
+#include "util/util_debug.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
 
 #include <OpenImageIO/filesystem.h>
 #include <OpenImageIO/strutil.h>
@@ -45,7 +45,7 @@ OIIO_NAMESPACE_USING
 #  include <shlwapi.h>
 #endif
 
-#include "util_windows.h"
+#include "util/util_windows.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -320,17 +320,18 @@ static char *path_specials(const string& sub)
 {
 	static bool env_init = false;
 	static char *env_shader_path;
-	static char *env_kernel_path;
+	static char *env_source_path;
 	if(!env_init) {
 		env_shader_path = getenv("CYCLES_SHADER_PATH");
-		env_kernel_path = getenv("CYCLES_KERNEL_PATH");
+		/* NOTE: It is KERNEL in env variable for compatibility reasons. */
+		env_source_path = getenv("CYCLES_KERNEL_PATH");
 		env_init = true;
 	}
 	if(env_shader_path != NULL && sub == "shader") {
 		return env_shader_path;
 	}
-	else if(env_shader_path != NULL && sub == "kernel") {
-		return env_kernel_path;
+	else if(env_source_path != NULL && sub == "source") {
+		return env_source_path;
 	}
 	return NULL;
 }
@@ -814,7 +815,7 @@ string path_source_replace_includes(const string& source,
 						/* Use line directives for better error messages. */
 						line = line_directive(filepath, 1)
 						     + token.replace(0, n_end + 1, "\n" + text + "\n")
-						     + line_directive(path_join(path, source_filename), i);
+						     + line_directive(path_join(path, source_filename), i + 1);
 					}
 				}
 			}
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index 70dbb5ae403..0e5e2d2c837 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -24,10 +24,10 @@
 
 #include <stdio.h>
 
-#include "util_set.h"
-#include "util_string.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_set.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 14215056840..39c1eed04e7 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -23,10 +23,10 @@
  * update notifications from a job running in another thread. All methods
  * except for the constructor/destructor are thread safe. */
 
-#include "util_function.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_thread.h"
+#include "util/util_function.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_simd.cpp b/intern/cycles/util/util_simd.cpp
index de2df612578..f90439c188b 100644
--- a/intern/cycles/util/util_simd.cpp
+++ b/intern/cycles/util/util_simd.cpp
@@ -19,7 +19,7 @@
     (defined(WITH_KERNEL_NATIVE) && defined(__SSE2__))
 
 #define __KERNEL_SSE2__
-#include "util_simd.h"
+#include "util/util_simd.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 756bd15ed25..557809a5719 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -20,8 +20,8 @@
 
 #include <limits>
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -451,11 +451,11 @@ ccl_device_inline int bitscan(int value)
 
 CCL_NAMESPACE_END
 
-#include "util_math.h"
-#include "util_sseb.h"
-#include "util_ssei.h"
-#include "util_ssef.h"
-#include "util_avxf.h"
+#include "util/util_math.h"
+#include "util/util_sseb.h"
+#include "util/util_ssei.h"
+#include "util/util_ssef.h"
+#include "util/util_avxf.h"
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
 
diff --git a/intern/cycles/util/util_sky_model.cpp b/intern/cycles/util/util_sky_model.cpp
index 5730986cc4f..6dda8469907 100644
--- a/intern/cycles/util/util_sky_model.cpp
+++ b/intern/cycles/util/util_sky_model.cpp
@@ -97,8 +97,8 @@ All instructions on how to use this code are in the accompanying header file.
 
 */
 
-#include "util_sky_model.h"
-#include "util_sky_model_data.h"
+#include "util/util_sky_model.h"
+#include "util/util_sky_model_data.h"
 
 #include <assert.h>
 #include <stdio.h>
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index 2f5295b5463..cf99a08efae 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -514,12 +514,12 @@ ccl_device_inline float len3(const ssef& a)
 /* faster version for SSSE3 */
 typedef ssei shuffle_swap_t;
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
 {
 	return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
 }
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
 {
 	return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
 }
@@ -534,12 +534,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s
 /* somewhat slower version for SSE2 */
 typedef int shuffle_swap_t;
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
 {
 	return 0;
 }
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
 {
 	return 1;
 }
diff --git a/intern/cycles/util/util_stack_allocator.h b/intern/cycles/util/util_stack_allocator.h
index d7aab5b250c..2f1799a739e 100644
--- a/intern/cycles/util/util_stack_allocator.h
+++ b/intern/cycles/util/util_stack_allocator.h
@@ -20,8 +20,8 @@
 #include <cstddef>
 #include <memory>
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index 033d85e8ec6..e90049254de 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -43,7 +43,9 @@ template <> class StaticAssertFailure<true> {};
 #    endif  /* __COUNTER__ */
 #  endif  /* C++11 or MSVC2015 */
 #else  /* __KERNEL_GPU__ */
-#  define static_assert(statement, message)
+#  ifndef static_assert
+#    define static_assert(statement, message)
+#  endif
 #endif  /* __KERNEL_GPU__ */
 
 /* TODO(sergey): For until C++11 is a bare minimum for us,
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index c21a8488c81..baba549753d 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_STATS_H__
 #define __UTIL_STATS_H__
 
-#include "util_atomic.h"
+#include "util/util_atomic.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index 5594aa8edb6..a1008d510d1 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -17,9 +17,9 @@
 #include <stdarg.h>
 #include <stdio.h>
 
-#include "util_foreach.h"
-#include "util_string.h"
-#include "util_windows.h"
+#include "util/util_foreach.h"
+#include "util/util_string.h"
+#include "util/util_windows.h"
 
 #ifdef _WIN32
 #  ifndef vsnprintf
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index 7aeed96f00b..e2c105db9c1 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <sstream>
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 87d885c44cf..a942d738b8a 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "util_system.h"
+#include "util/util_system.h"
 
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_types.h"
-#include "util_string.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_types.h"
+#include "util/util_string.h"
 
 #ifdef _WIN32
 #  if(!defined(FREE_WINDOWS))
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index ff61b260bed..db7a45b2d59 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_SYSTEM_H__
 #define __UTIL_SYSTEM_H__
 
-#include "util_string.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 0d1fed3ebbf..fb0c34e1dc4 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_system.h"
-#include "util_task.h"
-#include "util_time.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_system.h"
+#include "util/util_task.h"
+#include "util/util_time.h"
 
 //#define THREADING_DEBUG_ENABLED
 
diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h
index 0b82f14f66f..3ebfb007e40 100644
--- a/intern/cycles/util/util_task.h
+++ b/intern/cycles/util/util_task.h
@@ -17,10 +17,10 @@
 #ifndef __UTIL_TASK_H__
 #define __UTIL_TASK_H__
 
-#include "util_list.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp
index 3db8b4bd197..3dcb09804b0 100644
--- a/intern/cycles/util/util_thread.cpp
+++ b/intern/cycles/util/util_thread.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "util_thread.h"
+#include "util/util_thread.h"
 
-#include "util_system.h"
-#include "util_windows.h"
+#include "util/util_system.h"
+#include "util/util_windows.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index 427c633d2ce..1b4e87ebf03 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -32,7 +32,7 @@
 #  include <libkern/OSAtomic.h>
 #endif
 
-#include "util_function.h"
+#include "util/util_function.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_time.cpp b/intern/cycles/util/util_time.cpp
index 59c963cfafb..7c39aa294bf 100644
--- a/intern/cycles/util/util_time.cpp
+++ b/intern/cycles/util/util_time.cpp
@@ -16,8 +16,8 @@
 
 #include <stdlib.h>
 
-#include "util_time.h"
-#include "util_windows.h"
+#include "util/util_time.h"
+#include "util/util_windows.h"
 
 #ifdef _WIN32
 
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 2f10540c94e..b8f182ae962 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -46,10 +46,10 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "util_transform.h"
+#include "util/util_transform.h"
 
-#include "util_boundbox.h"
-#include "util_math.h"
+#include "util/util_boundbox.h"
+#include "util/util_math.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index a0695f20488..aef168ca64d 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -21,8 +21,8 @@
 #include <string.h>
 #endif
 
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index a000fae4bd6..bf4a134b998 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -37,6 +37,9 @@
 #define ccl_device_noinline static
 #define ccl_global
 #define ccl_constant
+#define ccl_local
+#define ccl_local_param
+#define ccl_private
 #define ccl_restrict __restrict
 #define __KERNEL_WITH_SSE_ALIGN__
 
@@ -82,7 +85,7 @@
 
 /* SIMD Types */
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #endif
 
@@ -103,10 +106,16 @@ typedef unsigned int uint;
 
 #endif
 
-#ifndef __KERNEL_GPU__
-
 /* Fixed Bits Types */
 
+#ifdef __KERNEL_OPENCL__
+
+typedef ulong uint64_t;
+
+#endif
+
+#ifndef __KERNEL_GPU__
+
 #ifdef _WIN32
 
 typedef signed char int8_t;
@@ -171,7 +180,7 @@ struct ccl_try_align(16) int3 {
 	};
 
 	__forceinline int3() {}
-	__forceinline int3(const __m128i a) : m128(a) {}
+	__forceinline int3(const __m128i& a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
 
@@ -193,7 +202,7 @@ struct ccl_try_align(16) int4 {
 	};
 
 	__forceinline int4() {}
-	__forceinline int4(const __m128i a) : m128(a) {}
+	__forceinline int4(const __m128i& a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
 
@@ -265,7 +274,7 @@ struct ccl_try_align(16) float4 {
 	};
 
 	__forceinline float4() {}
-	__forceinline float4(const __m128 a) : m128(a) {}
+	__forceinline float4(const __m128& a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
 
@@ -397,11 +406,6 @@ ccl_device_inline float4 make_float4(float x, float y, float z, float w)
 	return a;
 }
 
-ccl_device_inline int align_up(int offset, int alignment)
-{
-	return (offset + alignment - 1) & ~(alignment - 1);
-}
-
 ccl_device_inline int3 make_int3(int i)
 {
 #ifdef __KERNEL_SSE__
@@ -476,6 +480,24 @@ ccl_device_inline int4 make_int4(const float3& f)
 
 #endif
 
+/* Round offset up to a multiple of alignment; alignment must be a power of two. */
+ccl_device_inline size_t align_up(size_t offset, size_t alignment)
+{
+	return (offset + alignment - 1) & ~(alignment - 1);
+}
+
+/* Round x up to the nearest multiple of `multiple`, which must be non-zero. */
+ccl_device_inline size_t round_up(size_t x, size_t multiple)
+{
+	return ((x + multiple - 1) / multiple) * multiple;
+}
+
+/* Round x down to the nearest multiple of `multiple`, which must be non-zero. */
+ccl_device_inline size_t round_down(size_t x, size_t multiple)
+{
+	return (x / multiple) * multiple;
+}
+
 /* Interpolation types for textures
  * cuda also use texture space to store other objects */
 enum InterpolationType {
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index 546b17570bb..4add91a3368 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -23,9 +23,9 @@
 #include <cstring>
 #include <vector>
 
-#include "util_aligned_malloc.h"
-#include "util_guarded_allocator.h"
-#include "util_types.h"
+#include "util/util_aligned_malloc.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index 9796a5f896d..10d86167921 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -17,11 +17,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "util_opengl.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_version.h"
-#include "util_view.h"
+#include "util/util_opengl.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_version.h"
+#include "util/util_view.h"
 
 #ifdef __APPLE__
 #include <GLUT/glut.h>
diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp
index 4de8483564b..073db2a27db 100644
--- a/intern/cycles/util/util_windows.cpp
+++ b/intern/cycles/util/util_windows.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "util_windows.h"
+#include "util/util_windows.h"
 
 #ifdef _WIN32