diff options
318 files changed, 9225 insertions, 5846 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 1cc3e02e03a..1325ab7124b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -519,18 +519,20 @@ endif() option(WITH_LEGACY_DEPSGRAPH "Build Blender with legacy dependency graph" ON) mark_as_advanced(WITH_LEGACY_DEPSGRAPH) -# Use hardcoded paths or find_package to find externals -option(WITH_WINDOWS_FIND_MODULES "Use find_package to locate libraries" OFF) -mark_as_advanced(WITH_WINDOWS_FIND_MODULES) +if(WIN32) + # Use hardcoded paths or find_package to find externals + option(WITH_WINDOWS_FIND_MODULES "Use find_package to locate libraries" OFF) + mark_as_advanced(WITH_WINDOWS_FIND_MODULES) -option(WITH_WINDOWS_CODESIGN "Use signtool to sign the final binary." OFF) -mark_as_advanced(WITH_WINDOWS_CODESIGN) + option(WITH_WINDOWS_CODESIGN "Use signtool to sign the final binary." OFF) + mark_as_advanced(WITH_WINDOWS_CODESIGN) -set(WINDOWS_CODESIGN_PFX CACHE FILEPATH "Path to pfx file to use for codesigning.") -mark_as_advanced(WINDOWS_CODESIGN_PFX) + set(WINDOWS_CODESIGN_PFX CACHE FILEPATH "Path to pfx file to use for codesigning.") + mark_as_advanced(WINDOWS_CODESIGN_PFX) -set(WINDOWS_CODESIGN_PFX_PASSWORD CACHE STRING "password for pfx file used for codesigning.") -mark_as_advanced(WINDOWS_CODESIGN_PFX_PASSWORD) + set(WINDOWS_CODESIGN_PFX_PASSWORD CACHE STRING "password for pfx file used for codesigning.") + mark_as_advanced(WINDOWS_CODESIGN_PFX_PASSWORD) +endif() # avoid using again option_defaults_clear() diff --git a/build_files/cmake/packaging.cmake b/build_files/cmake/packaging.cmake index c7063ed6772..e8621bc457a 100644 --- a/build_files/cmake/packaging.cmake +++ b/build_files/cmake/packaging.cmake @@ -1,5 +1,7 @@ -set(PROJECT_DESCRIPTION "Blender is a very fast and versatile 3D modeller/renderer.") -set(PROJECT_COPYRIGHT "Copyright (C) 2001-2012 Blender Foundation") +string(TIMESTAMP CURRENT_YEAR "%Y") + +set(PROJECT_DESCRIPTION "Blender is the free and open source 3D creation suite software.") 
+set(PROJECT_COPYRIGHT "Copyright (C) 2001-${CURRENT_YEAR} Blender Foundation") set(PROJECT_CONTACT "foundation@blender.org") set(PROJECT_VENDOR "Blender Foundation") @@ -135,4 +137,3 @@ unset(MINOR_VERSION) unset(PATCH_VERSION) unset(BUILD_REV) - diff --git a/doc/python_api/rst/bge.texture.rst b/doc/python_api/rst/bge.texture.rst index 49f6c4469a4..3028ee653f8 100644 --- a/doc/python_api/rst/bge.texture.rst +++ b/doc/python_api/rst/bge.texture.rst @@ -681,7 +681,7 @@ Image classes .. attribute:: zbuff - Use depth component of render as grey scale color - suitable for texture source. + Use depth component of render as grayscale color - suitable for texture source. :type: bool @@ -817,7 +817,7 @@ Image classes .. attribute:: zbuff - Use depth component of viewport as grey scale color - suitable for texture source. + Use depth component of viewport as grayscale color - suitable for texture source. :type: bool @@ -1260,8 +1260,8 @@ Filter classes .. class:: FilterGray - Filter for gray scale effect. - Proportions of R, G and B contributions in the output gray scale are 28:151:77. + Filter for grayscale effect. + Proportions of R, G and B contributions in the output grayscale are 28:151:77. .. 
attribute:: previous diff --git a/doc/python_api/sphinx_doc_gen.py b/doc/python_api/sphinx_doc_gen.py index ec3131ca19e..47bb323e574 100644 --- a/doc/python_api/sphinx_doc_gen.py +++ b/doc/python_api/sphinx_doc_gen.py @@ -427,9 +427,9 @@ if BLENDER_REVISION != "Unknown": BLENDER_VERSION_DOTS += " " + BLENDER_REVISION # '2.62.1 SHA1' BLENDER_VERSION_PATH = "_".join(blender_version_strings) # '2_62_1' -if bpy.app.version_cycle == "release": - BLENDER_VERSION_PATH = "%s%s_release" % ("_".join(blender_version_strings[:2]), - bpy.app.version_char) # '2_62_release' +if bpy.app.version_cycle in {"rc", "release"}: + # '2_62a_release' + BLENDER_VERSION_PATH = "%s%s_release" % ("_".join(blender_version_strings[:2]), bpy.app.version_char) # --------------------------DOWNLOADABLE FILES---------------------------------- diff --git a/doc/python_api/sphinx_doc_update.py b/doc/python_api/sphinx_doc_update.py index 3d48c1145e1..561e58dec66 100755 --- a/doc/python_api/sphinx_doc_update.py +++ b/doc/python_api/sphinx_doc_update.py @@ -96,6 +96,11 @@ def main(): rsync_base = "rsync://%s@%s:%s" % (args.user, args.rsync_server, args.rsync_root) + blenver = blenver_zip = "" + api_name = "" + branch = "" + is_release = False + # I) Update local mirror using rsync. rsync_mirror_cmd = ("rsync", "--delete-after", "-avzz", rsync_base, args.mirror_dir) subprocess.run(rsync_mirror_cmd, env=dict(os.environ, RSYNC_PASSWORD=args.password)) @@ -108,19 +113,24 @@ def main(): subprocess.run(doc_gen_cmd) # III) Get Blender version info. 
- blenver = blenver_zip = "" getver_file = os.path.join(tmp_dir, "blendver.txt") getver_script = ("" "import sys, bpy\n" "with open(sys.argv[-1], 'w') as f:\n" - " f.write('%d_%d%s_release\\n' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n" - " if bpy.app.version_cycle in {'rc', 'release'} else '%d_%d_%d\\n' % bpy.app.version)\n" - " f.write('%d_%d_%d' % bpy.app.version)\n") + " is_release = bpy.app.version_cycle in {'rc', 'release'}\n" + " branch = bpy.app.build_branch.split()[0].decode()\n" + " f.write('%d\\n' % is_release)\n" + " f.write('%s\\n' % branch)\n" + " f.write('%d.%d%s\\n' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n" + " if is_release else '%s\\n' % branch)\n" + " f.write('%d_%d%s_release' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n" + " if is_release else '%d_%d_%d' % bpy.app.version)\n") get_ver_cmd = (args.blender, "--background", "-noaudio", "--factory-startup", "--python-exit-code", "1", "--python-expr", getver_script, "--", getver_file) subprocess.run(get_ver_cmd) with open(getver_file) as f: - blenver, blenver_zip = f.read().split("\n") + is_release, branch, blenver, blenver_zip = f.read().split("\n") + is_release = bool(int(is_release)) os.remove(getver_file) # IV) Build doc. @@ -132,7 +142,7 @@ def main(): os.chdir(curr_dir) # V) Cleanup existing matching dir in server mirror (if any), and copy new doc. - api_name = "blender_python_api_%s" % blenver + api_name = blenver api_dir = os.path.join(args.mirror_dir, api_name) if os.path.exists(api_dir): shutil.rmtree(api_dir) @@ -150,19 +160,15 @@ def main(): os.rename(zip_path, os.path.join(api_dir, "%s.zip" % zip_name)) # VII) Create symlinks and html redirects. 
- #~ os.symlink(os.path.join(DEFAULT_SYMLINK_ROOT, api_name, "contents.html"), os.path.join(api_dir, "index.html")) os.symlink("./contents.html", os.path.join(api_dir, "index.html")) - if blenver.endswith("release"): - symlink = os.path.join(args.mirror_dir, "blender_python_api_current") + if is_release: + symlink = os.path.join(args.mirror_dir, "current") os.remove(symlink) os.symlink("./%s" % api_name, symlink) with open(os.path.join(args.mirror_dir, "250PythonDoc/index.html"), 'w') as f: f.write("<html><head><title>Redirecting...</title><meta http-equiv=\"REFRESH\"" "content=\"0;url=../%s/\"></head><body>Redirecting...</body></html>" % api_name) - else: - symlink = os.path.join(args.mirror_dir, "blender_python_api_master") - os.remove(symlink) - os.symlink("./%s" % api_name, symlink) + elif branch == "master": with open(os.path.join(args.mirror_dir, "blender_python_api/index.html"), 'w') as f: f.write("<html><head><title>Redirecting...</title><meta http-equiv=\"REFRESH\"" "content=\"0;url=../%s/\"></head><body>Redirecting...</body></html>" % api_name) diff --git a/extern/cuew/include/cuew.h b/extern/cuew/include/cuew.h index 19087117667..4cce29d38ab 100644 --- a/extern/cuew/include/cuew.h +++ b/extern/cuew/include/cuew.h @@ -114,7 +114,7 @@ extern "C" { #define cuGLGetDevices cuGLGetDevices_v2 /* Types. 
*/ -#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__) typedef unsigned long long CUdeviceptr; #else typedef unsigned int CUdeviceptr; diff --git a/intern/atomic/atomic_ops.h b/intern/atomic/atomic_ops.h index 1107deddf94..1e9528f9ed9 100644 --- a/intern/atomic/atomic_ops.h +++ b/intern/atomic/atomic_ops.h @@ -101,11 +101,11 @@ ATOMIC_INLINE size_t atomic_fetch_and_add_z(size_t *p, size_t x); ATOMIC_INLINE size_t atomic_fetch_and_sub_z(size_t *p, size_t x); ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new); -ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x); -ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x); -ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x); -ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x); -ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new); +ATOMIC_INLINE unsigned int atomic_add_and_fetch_u(unsigned int *p, unsigned int x); +ATOMIC_INLINE unsigned int atomic_sub_and_fetch_u(unsigned int *p, unsigned int x); +ATOMIC_INLINE unsigned int atomic_fetch_and_add_u(unsigned int *p, unsigned int x); +ATOMIC_INLINE unsigned int atomic_fetch_and_sub_u(unsigned int *p, unsigned int x); +ATOMIC_INLINE unsigned int atomic_cas_u(unsigned int *v, unsigned int old, unsigned int _new); /* WARNING! Float 'atomics' are really faked ones, those are actually closer to some kind of spinlock-sync'ed operation, * which means they are only efficient if collisions are highly unlikely (i.e. 
if probability of two threads diff --git a/intern/atomic/intern/atomic_ops_ext.h b/intern/atomic/intern/atomic_ops_ext.h index 8421aa72192..b72c94563fc 100644 --- a/intern/atomic/intern/atomic_ops_ext.h +++ b/intern/atomic/intern/atomic_ops_ext.h @@ -113,58 +113,58 @@ ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new) /******************************************************************************/ /* unsigned operations. */ -ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x) +ATOMIC_INLINE unsigned int atomic_add_and_fetch_u(unsigned int *p, unsigned int x) { - assert(sizeof(unsigned) == LG_SIZEOF_INT); + assert(sizeof(unsigned int) == LG_SIZEOF_INT); #if (LG_SIZEOF_INT == 8) - return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x); + return (unsigned int)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x); #elif (LG_SIZEOF_INT == 4) - return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x); + return (unsigned int)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x); #endif } -ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x) +ATOMIC_INLINE unsigned int atomic_sub_and_fetch_u(unsigned int *p, unsigned int x) { - assert(sizeof(unsigned) == LG_SIZEOF_INT); + assert(sizeof(unsigned int) == LG_SIZEOF_INT); #if (LG_SIZEOF_INT == 8) - return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x)); + return (unsigned int)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x)); #elif (LG_SIZEOF_INT == 4) - return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x)); + return (unsigned int)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x)); #endif } -ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x) +ATOMIC_INLINE unsigned int atomic_fetch_and_add_u(unsigned int *p, unsigned int x) { - assert(sizeof(unsigned) == LG_SIZEOF_INT); + assert(sizeof(unsigned int) == LG_SIZEOF_INT); 
#if (LG_SIZEOF_INT == 8) - return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x); + return (unsigned int)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x); #elif (LG_SIZEOF_INT == 4) - return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x); + return (unsigned int)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x); #endif } -ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x) +ATOMIC_INLINE unsigned int atomic_fetch_and_sub_u(unsigned int *p, unsigned int x) { - assert(sizeof(unsigned) == LG_SIZEOF_INT); + assert(sizeof(unsigned int) == LG_SIZEOF_INT); #if (LG_SIZEOF_INT == 8) - return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x)); + return (unsigned int)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x)); #elif (LG_SIZEOF_INT == 4) - return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x)); + return (unsigned int)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x)); #endif } -ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new) +ATOMIC_INLINE unsigned int atomic_cas_u(unsigned int *v, unsigned int old, unsigned int _new) { - assert(sizeof(unsigned) == LG_SIZEOF_INT); + assert(sizeof(unsigned int) == LG_SIZEOF_INT); #if (LG_SIZEOF_INT == 8) - return (unsigned)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new); + return (unsigned int)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new); #elif (LG_SIZEOF_INT == 4) - return (unsigned)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new); + return (unsigned int)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new); #endif } diff --git a/intern/audaspace/intern/AUD_SoftwareDevice.cpp b/intern/audaspace/intern/AUD_SoftwareDevice.cpp index 15594d340be..f9d65aa2363 100644 --- a/intern/audaspace/intern/AUD_SoftwareDevice.cpp +++ b/intern/audaspace/intern/AUD_SoftwareDevice.cpp @@ -365,6 +365,7 @@ bool 
AUD_SoftwareDevice::AUD_SoftwareHandle::seek(float position) if(!m_status) return false; + m_pitch->setPitch(m_user_pitch); m_reader->seek((int)(position * m_reader->getSpecs().rate)); if(m_status == AUD_STATUS_STOPPED) diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 5c51f9afc28..ca109734314 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -665,8 +665,10 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True) cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True) cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True) + cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False) + cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False) cls.debug_opencl_kernel_type = EnumProperty( name="OpenCL Kernel Type", diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 44af5f7efed..7c1e3e270fb 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -1518,10 +1518,12 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel): row.prop(cscene, "debug_use_cpu_avx", toggle=True) row.prop(cscene, "debug_use_cpu_avx2", toggle=True) col.prop(cscene, "debug_use_qbvh") + col.prop(cscene, "debug_use_cpu_split_kernel") col = layout.column() col.label('CUDA Flags:') col.prop(cscene, "debug_use_cuda_adaptive_compile") + col.prop(cscene, "debug_use_cuda_split_kernel") col = layout.column() col.label('OpenCL Flags:') diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp index e42ff5d72a6..ffa5b676917 100644 --- a/intern/cycles/blender/blender_curves.cpp +++ b/intern/cycles/blender/blender_curves.cpp @@ -411,6 +411,7 @@ static void 
ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData, } } + mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -434,8 +435,8 @@ static void ExportCurveTriangleGeometry(Mesh *mesh, if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f) continue; - numverts += (CData->curve_keynum[curve] - 2)*2*resolution + resolution; - numtris += (CData->curve_keynum[curve] - 2)*resolution; + numverts += (CData->curve_keynum[curve] - 1)*resolution + resolution; + numtris += (CData->curve_keynum[curve] - 1)*2*resolution; } } @@ -545,6 +546,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh, } } + mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size()); mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL); mesh->attributes.remove(ATTR_STD_FACE_NORMAL); mesh->add_face_normals(); @@ -890,7 +892,7 @@ void BlenderSync::sync_curves(Mesh *mesh, } /* obtain general settings */ - bool use_curves = scene->curve_system_manager->use_curves; + const bool use_curves = scene->curve_system_manager->use_curves; if(!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT)) { if(!motion) @@ -898,11 +900,11 @@ void BlenderSync::sync_curves(Mesh *mesh, return; } - int primitive = scene->curve_system_manager->primitive; - int triangle_method = scene->curve_system_manager->triangle_method; - int resolution = scene->curve_system_manager->resolution; - size_t vert_num = mesh->verts.size(); - size_t tri_num = mesh->num_triangles(); + const int primitive = scene->curve_system_manager->primitive; + const int triangle_method = scene->curve_system_manager->triangle_method; + const int resolution = scene->curve_system_manager->resolution; + const size_t vert_num = mesh->verts.size(); + const size_t tri_num = mesh->num_triangles(); int used_res = 1; /* extract particle hair data - should be combined with connecting to mesh later*/ diff --git 
a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index e269be61791..fdc287084eb 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -564,7 +564,7 @@ static void attr_create_pointiness(Scene *scene, * original vertex. */ vector<int> sorted_vert_indeices(num_verts); - for (int vert_index = 0; vert_index < num_verts; ++vert_index) { + for(int vert_index = 0; vert_index < num_verts; ++vert_index) { sorted_vert_indeices[vert_index] = vert_index; } VertexAverageComparator compare(mesh->verts); @@ -573,9 +573,9 @@ static void attr_create_pointiness(Scene *scene, * index. */ vector<int> vert_orig_index(num_verts); - for (int sorted_vert_index = 0; - sorted_vert_index < num_verts; - ++sorted_vert_index) + for(int sorted_vert_index = 0; + sorted_vert_index < num_verts; + ++sorted_vert_index) { const int vert_index = sorted_vert_indeices[sorted_vert_index]; const float3 &vert_co = mesh->verts[vert_index]; @@ -589,12 +589,12 @@ static void attr_create_pointiness(Scene *scene, const float3 &other_vert_co = mesh->verts[other_vert_index]; /* We are too far away now, we wouldn't have duplicate. */ if ((other_vert_co.x + other_vert_co.y + other_vert_co.z) - - (vert_co.x + vert_co.y + vert_co.z) > 0.0f) + (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON) { break; } /* Found duplicate. */ - if(other_vert_co == vert_co) { + if(len_squared(other_vert_co - vert_co) < FLT_EPSILON) { found = true; vert_orig_index[vert_index] = other_vert_index; break; @@ -777,6 +777,15 @@ static void create_mesh(Scene *scene, int shader = clamp(f->material_index(), 0, used_shaders.size()-1); bool smooth = f->use_smooth() || use_loop_normals; + if(use_loop_normals) { + BL::Array<float, 12> loop_normals = f->split_normals(); + for(int i = 0; i < n; i++) { + N[vi[i]] = make_float3(loop_normals[i * 3], + loop_normals[i * 3 + 1], + loop_normals[i * 3 + 2]); + } + } + /* Create triangles. 
* * NOTE: Autosmooth is already taken care about. diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 438abc49f88..75118c43747 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -67,8 +67,10 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene) flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3"); flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2"); flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh"); + flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel"); /* Synchronize CUDA flags. */ flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile"); + flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel"); /* Synchronize OpenCL kernel type. */ switch(get_enum(cscene, "debug_opencl_kernel_type")) { case 0: diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h index 8120de96362..23df3c1bc30 100644 --- a/intern/cycles/blender/blender_util.h +++ b/intern/cycles/blender/blender_util.h @@ -79,7 +79,7 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data, me.calc_normals_split(); } else { - me.split_faces(); + me.split_faces(false); } } if(subdivision_type == Mesh::SUBDIVISION_NONE) { diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp index 874a4246d1d..1fb2f371a0f 100644 --- a/intern/cycles/bvh/bvh.cpp +++ b/intern/cycles/bvh/bvh.cpp @@ -81,6 +81,7 @@ void BVH::build(Progress& progress) pack.prim_type, pack.prim_index, pack.prim_object, + pack.prim_time, params, progress); BVHNode *root = bvh_build.run(); @@ -256,6 +257,10 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) pack.leaf_nodes.resize(leaf_nodes_size); pack.object_node.resize(objects.size()); + if(params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0) { + pack.prim_time.resize(prim_index_size); + } + int *pack_prim_index = 
(pack.prim_index.size())? &pack.prim_index[0]: NULL; int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL; int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL; @@ -264,6 +269,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL; int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL; int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL; + float2 *pack_prim_time = (pack.prim_time.size())? &pack.prim_time[0]: NULL; /* merge */ foreach(Object *ob, objects) { @@ -309,6 +315,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) int *bvh_prim_type = &bvh->pack.prim_type[0]; uint *bvh_prim_visibility = &bvh->pack.prim_visibility[0]; uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0]; + float2 *bvh_prim_time = bvh->pack.prim_time.size()? &bvh->pack.prim_time[0]: NULL; for(size_t i = 0; i < bvh_prim_index_size; i++) { if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) { @@ -324,6 +331,9 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size) pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i]; pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i]; pack_prim_object[pack_prim_index_offset] = 0; // unused for instances + if(bvh_prim_time != NULL) { + pack_prim_time[pack_prim_index_offset] = bvh_prim_time[i]; + } pack_prim_index_offset++; } } diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h index 35f4d305883..08f41fc736f 100644 --- a/intern/cycles/bvh/bvh.h +++ b/intern/cycles/bvh/bvh.h @@ -68,6 +68,8 @@ struct PackedBVH { array<int> prim_index; /* mapping from BVH primitive index, to the object id of that primitive. */ array<int> prim_object; + /* Time range of BVH primitive. */ + array<float2> prim_time; /* index of the root node. 
*/ int root_index; diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp index a2f8b33cb0b..517afc75641 100644 --- a/intern/cycles/bvh/bvh_build.cpp +++ b/intern/cycles/bvh/bvh_build.cpp @@ -93,12 +93,14 @@ BVHBuild::BVHBuild(const vector<Object*>& objects_, array<int>& prim_type_, array<int>& prim_index_, array<int>& prim_object_, + array<float2>& prim_time_, const BVHParams& params_, Progress& progress_) : objects(objects_), prim_type(prim_type_), prim_index(prim_index_), prim_object(prim_object_), + prim_time(prim_time_), params(params_), progress(progress_), progress_start_time(0.0), @@ -465,6 +467,9 @@ BVHNode* BVHBuild::run() } spatial_free_index = 0; + need_prim_time = params.num_motion_curve_steps > 0 || + params.num_motion_triangle_steps > 0; + /* init progress updates */ double build_start_time; build_start_time = progress_start_time = time_dt(); @@ -475,6 +480,12 @@ BVHNode* BVHBuild::run() prim_type.resize(references.size()); prim_index.resize(references.size()); prim_object.resize(references.size()); + if(need_prim_time) { + prim_time.resize(references.size()); + } + else { + prim_time.resize(0); + } /* build recursively */ BVHNode *rootnode; @@ -849,6 +860,9 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start, prim_type[start] = ref->prim_type(); prim_index[start] = ref->prim_index(); prim_object[start] = ref->prim_object(); + if(need_prim_time) { + prim_time[start] = make_float2(ref->time_from(), ref->time_to()); + } uint visibility = objects[ref->prim_object()]->visibility; BVHNode *leaf_node = new LeafNode(ref->bounds(), visibility, start, start+1); @@ -891,11 +905,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, * can not control. 
*/ typedef StackAllocator<256, int> LeafStackAllocator; + typedef StackAllocator<256, float2> LeafTimeStackAllocator; typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator; vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL]; vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL]; + vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL]; vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL]; /* TODO(sergey): In theory we should be able to store references. */ @@ -918,6 +934,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, p_type[type_index].push_back(ref.prim_type()); p_index[type_index].push_back(ref.prim_index()); p_object[type_index].push_back(ref.prim_object()); + p_time[type_index].push_back(make_float2(ref.time_from(), + ref.time_to())); bounds[type_index].grow(ref.bounds()); visibility[type_index] |= objects[ref.prim_object()]->visibility; @@ -947,9 +965,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, vector<int, LeafStackAllocator> local_prim_type, local_prim_index, local_prim_object; + vector<float2, LeafTimeStackAllocator> local_prim_time; local_prim_type.resize(num_new_prims); local_prim_index.resize(num_new_prims); local_prim_object.resize(num_new_prims); + if(need_prim_time) { + local_prim_time.resize(num_new_prims); + } for(int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) { int num = (int)p_type[i].size(); if(num != 0) { @@ -962,6 +984,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, local_prim_type[index] = p_type[i][j]; local_prim_index[index] = p_index[i][j]; local_prim_object[index] = p_object[i][j]; + if(need_prim_time) { + local_prim_time[index] = p_time[i][j]; + } if(params.use_unaligned_nodes && !alignment_found) { alignment_found = unaligned_heuristic.compute_aligned_space(p_ref[i][j], @@ -1028,11 +1053,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, 
prim_type.reserve(reserve); prim_index.reserve(reserve); prim_object.reserve(reserve); + if(need_prim_time) { + prim_time.reserve(reserve); + } } prim_type.resize(range_end); prim_index.resize(range_end); prim_object.resize(range_end); + if(need_prim_time) { + prim_time.resize(range_end); + } } spatial_spin_lock.unlock(); @@ -1041,6 +1072,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size); memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size); memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size); + if(need_prim_time) { + memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data); + } } } else { @@ -1053,6 +1087,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range, memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size); memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size); memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size); + if(need_prim_time) { + memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data); + } } } diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h index ee3cde66a2f..430efc3e0f6 100644 --- a/intern/cycles/bvh/bvh_build.h +++ b/intern/cycles/bvh/bvh_build.h @@ -48,6 +48,7 @@ public: array<int>& prim_type, array<int>& prim_index, array<int>& prim_object, + array<float2>& prim_time, const BVHParams& params, Progress& progress); ~BVHBuild(); @@ -112,6 +113,9 @@ protected: array<int>& prim_type; array<int>& prim_index; array<int>& prim_object; + array<float2>& prim_time; + + bool need_prim_time; /* Build parameters. 
*/ BVHParams params; diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h index 65f9da1c194..7b309504728 100644 --- a/intern/cycles/bvh/bvh_params.h +++ b/intern/cycles/bvh/bvh_params.h @@ -104,6 +104,7 @@ public: primitive_mask = PRIMITIVE_ALL; num_motion_curve_steps = 0; + num_motion_triangle_steps = 0; } /* SAH costs */ diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 966ff5e52ba..a2373451696 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -3,6 +3,7 @@ set(INC . ../graph ../kernel + ../kernel/split ../kernel/svm ../kernel/osl ../util @@ -33,6 +34,7 @@ set(SRC device_cuda.cpp device_multi.cpp device_opencl.cpp + device_split_kernel.cpp device_task.cpp ) @@ -56,6 +58,7 @@ set(SRC_HEADERS device_memory.h device_intern.h device_network.h + device_split_kernel.h device_task.h ) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 31c99f49d6d..6b07b9d04bd 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -80,7 +80,7 @@ Device::~Device() void Device::pixels_alloc(device_memory& mem) { - mem_alloc(mem, MEM_READ_WRITE); + mem_alloc("pixels", mem, MEM_READ_WRITE); } void Device::pixels_copy_from(device_memory& mem, int y, int w, int h) diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index ccee25ae34e..c740cada98b 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -234,7 +234,7 @@ public: Stats &stats; /* regular memory */ - virtual void mem_alloc(device_memory& mem, MemoryType type) = 0; + virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0; virtual void mem_copy_to(device_memory& mem) = 0; virtual void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) = 0; diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index c8e001ec2fd..06a1568b4d6 100644 --- 
a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -26,10 +26,12 @@ #include "device.h" #include "device_intern.h" +#include "device_split_kernel.h" #include "kernel.h" #include "kernel_compat_cpu.h" #include "kernel_types.h" +#include "split/kernel_split_data.h" #include "kernel_globals.h" #include "osl_shader.h" @@ -41,6 +43,7 @@ #include "util_foreach.h" #include "util_function.h" #include "util_logging.h" +#include "util_map.h" #include "util_opengl.h" #include "util_progress.h" #include "util_system.h" @@ -48,8 +51,93 @@ CCL_NAMESPACE_BEGIN +class CPUDevice; + +class CPUSplitKernel : public DeviceSplitKernel { + CPUDevice *device; +public: + explicit CPUSplitKernel(CPUDevice *device); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); +}; + class CPUDevice : public Device { + static unordered_map<string, void*> kernel_functions; + + static void register_kernel_function(const char* name, void* func) + { + kernel_functions[name] = func; + } + + static const char* get_arch_name() + { +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + return "cpu_avx2"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + return "cpu_avx"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + return 
"cpu_sse41"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) { + return "cpu_sse3"; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { + return "cpu_sse2"; + } + else +#endif + { + return "cpu"; + } + } + + template<typename F> + static F get_kernel_function(string name) + { + name = string("kernel_") + get_arch_name() + "_" + name; + + unordered_map<string, void*>::iterator it = kernel_functions.find(name); + + if(it == kernel_functions.end()) { + assert(!"kernel function not found"); + return NULL; + } + + return (F)it->second; + } + + friend class CPUSplitKernel; + public: TaskPool task_pool; KernelGlobals kernel_globals; @@ -57,10 +145,15 @@ public: #ifdef WITH_OSL OSLGlobals osl_globals; #endif + + bool use_split_kernel; + + DeviceRequestedFeatures requested_features; CPUDevice(DeviceInfo& info, Stats &stats, bool background) : Device(info, stats, background) { + #ifdef WITH_OSL kernel_globals.osl = &osl_globals; #endif @@ -105,6 +198,28 @@ public: { VLOG(1) << "Will be using regular kernels."; } + + use_split_kernel = DebugFlags().cpu.split_kernel; + if(use_split_kernel) { + VLOG(1) << "Will be using split kernel."; + } + + kernel_cpu_register_functions(register_kernel_function); +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + kernel_cpu_sse2_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + kernel_cpu_sse3_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + kernel_cpu_sse41_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + kernel_cpu_avx_register_functions(register_kernel_function); +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + kernel_cpu_avx2_register_functions(register_kernel_function); +#endif } ~CPUDevice() @@ -117,9 +232,20 @@ public: return (TaskScheduler::num_threads() == 1); } - void mem_alloc(device_memory& mem, 
MemoryType /*type*/) + void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + mem.device_pointer = mem.data_pointer; + + if(!mem.device_pointer) { + mem.device_pointer = (device_ptr)malloc(mem.memory_size()); + } + mem.device_size = mem.memory_size(); stats.mem_alloc(mem.device_size); } @@ -144,6 +270,10 @@ public: void mem_free(device_memory& mem) { if(mem.device_pointer) { + if(!mem.data_pointer) { + free((void*)mem.device_pointer); + } + mem.device_pointer = 0; stats.mem_free(mem.device_size); mem.device_size = 0; @@ -196,8 +326,14 @@ public: void thread_run(DeviceTask *task) { - if(task->type == DeviceTask::PATH_TRACE) - thread_path_trace(*task); + if(task->type == DeviceTask::PATH_TRACE) { + if(!use_split_kernel) { + thread_path_trace(*task); + } + else { + thread_path_trace_split(*task); + } + } else if(task->type == DeviceTask::FILM_CONVERT) thread_film_convert(*task); else if(task->type == DeviceTask::SHADER) @@ -258,7 +394,7 @@ public: { path_trace_kernel = kernel_cpu_path_trace; } - + while(task.acquire_tile(this, tile)) { float *render_buffer = (float*)tile.buffer; uint *rng_state = (uint*)tile.rng_state; @@ -294,6 +430,49 @@ public: thread_kernel_globals_free(&kg); } + void thread_path_trace_split(DeviceTask& task) + { + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + return; + } + + RenderTile tile; + + CPUSplitKernel split_kernel(this); + + /* allocate buffer for kernel globals */ + device_memory kgbuffer; + kgbuffer.resize(sizeof(KernelGlobals)); + mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); + + KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer; + *kg = thread_kernel_globals_init(); + + requested_features.max_closure = MAX_CLOSURE; + if(!split_kernel.load_kernels(requested_features)) { + 
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); + + return; + } + + while(task.acquire_tile(this, tile)) { + device_memory data; + split_kernel.path_trace(&task, tile, kgbuffer, data); + + task.release_tile(tile); + + if(task_pool.canceled()) { + if(task.need_finish_queue == false) + break; + } + } + + thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); + mem_free(kgbuffer); + } + void thread_film_convert(DeviceTask& task) { float sample_scale = 1.0f/(task.sample + 1); @@ -501,6 +680,10 @@ protected: inline void thread_kernel_globals_free(KernelGlobals *kg) { + if(kg == NULL) { + return; + } + if(kg->transparent_shadow_intersections != NULL) { free(kg->transparent_shadow_intersections); } @@ -515,8 +698,176 @@ protected: OSLShader::thread_free(kg); #endif } + + virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) { + requested_features = requested_features_; + + return true; + } +}; + +/* split kernel */ + +class CPUSplitKernelFunction : public SplitKernelFunction { +public: + CPUDevice* device; + void (*func)(KernelGlobals *kg, KernelData *data); + + CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {} + ~CPUSplitKernelFunction() {} + + virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data) + { + if(!func) { + return false; + } + + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + func(kg, (KernelData*)data.device_pointer); + } + } + + return true; + } }; +CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + 
device_memory& kernel_globals, + device_memory& data, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flags, + device_memory& work_pool_wgs) +{ + typedef void(*data_init_t)(KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + + data_init_t data_init; + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 + if(system_cpu_support_avx2()) { + data_init = kernel_cpu_avx2_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX + if(system_cpu_support_avx()) { + data_init = kernel_cpu_avx_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 + if(system_cpu_support_sse41()) { + data_init = kernel_cpu_sse41_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 + if(system_cpu_support_sse3()) { + data_init = kernel_cpu_sse3_data_init; + } + else +#endif +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 + if(system_cpu_support_sse2()) { + data_init = kernel_cpu_sse2_data_init; + } + else +#endif + { + data_init = kernel_cpu_data_init; + } + + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for(int y = 0; y < dim.global_size[1]; y++) { + for(int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + data_init((KernelGlobals*)kernel_globals.device_pointer, + (KernelData*)data.device_pointer, + (void*)split_data.device_pointer, + num_global_elements, + (char*)ray_state.device_pointer, + (uint*)rtile.rng_state, + rtile.start_sample, + rtile.start_sample + rtile.num_samples, + 
rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int*)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char*)use_queues_flags.device_pointer, + (uint*)work_pool_wgs.device_pointer, + rtile.num_samples, + (float*)rtile.buffer); + } + } + + return true; +} + +SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +{ + CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); + + kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name); + if(!kernel->func) { + delete kernel; + return NULL; + } + + return kernel; +} + +int2 CPUSplitKernel::split_kernel_local_size() +{ + return make_int2(1, 1); +} + +int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask *task) { + /* TODO(mai): this needs investigation but cpu gives incorrect render if global size doesnt match tile size */ + return task->requested_tile_size; +} + +size_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) { + KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; + + return split_data_buffer_size(kg, num_threads); +} + +unordered_map<string, void*> CPUDevice::kernel_functions; + Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background) { return new CPUDevice(info, stats, background); diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index dafac6dfcb3..a630a3d1183 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -15,12 +15,14 @@ */ #include <climits> +#include <limits.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "device.h" #include "device_intern.h" +#include "device_split_kernel.h" #include "buffers.h" @@ -42,6 +44,8 @@ #include "util_types.h" #include "util_time.h" +#include 
"split/kernel_split_data_types.h" + CCL_NAMESPACE_BEGIN #ifndef WITH_CUDA_DYNLOAD @@ -78,6 +82,31 @@ int cuewCompilerVersion(void) } /* namespace */ #endif /* WITH_CUDA_DYNLOAD */ +class CUDADevice; + +class CUDASplitKernel : public DeviceSplitKernel { + CUDADevice *device; +public: + explicit CUDASplitKernel(CUDADevice *device); + + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs); + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); +}; + class CUDADevice : public Device { public: @@ -258,11 +287,16 @@ public: return DebugFlags().cuda.adaptive_compile; } + bool use_split_kernel() + { + return DebugFlags().cuda.split_kernel; + } + /* Common NVCC flags which stays the same regardless of shading model, * kernel sources md5 and only depends on compiler or compilation settings. 
*/ string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features) + const DeviceRequestedFeatures& requested_features, bool split=false) { const int cuda_version = cuewCompilerVersion(); const int machine = system_cpu_bits(); @@ -287,6 +321,11 @@ public: #ifdef WITH_CYCLES_DEBUG cflags += " -D__KERNEL_DEBUG__"; #endif + + if(split) { + cflags += " -D__SPLIT__"; + } + return cflags; } @@ -320,7 +359,7 @@ public: return true; } - string compile_kernel(const DeviceRequestedFeatures& requested_features) + string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false) { /* Compute cubin name. */ int major, minor; @@ -329,7 +368,8 @@ public: /* Attempt to use kernel provided with Blender. */ if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", + const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin" + : "lib/kernel_sm_%d%d.cubin", major, minor)); VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; if(path_exists(cubin)) { @@ -339,7 +379,7 @@ public: } const string common_cflags = - compile_kernel_get_common_cflags(requested_features); + compile_kernel_get_common_cflags(requested_features, split); /* Try to use locally compiled kernel. */ const string kernel_path = path_get("kernel"); @@ -350,7 +390,8 @@ public: */ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin", + const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin" + : "cycles_kernel_sm%d%d_%s.cubin", major, minor, cubin_md5.c_str()); const string cubin = path_cache_get(path_join("kernels", cubin_file)); @@ -385,7 +426,7 @@ public: const char *nvcc = cuewCompilerPath(); const string kernel = path_join(kernel_path, path_join("kernels", - path_join("cuda", "kernel.cu"))); + path_join("cuda", split ? 
"kernel_split.cu" : "kernel.cu"))); double starttime = time_dt(); printf("Compiling CUDA kernel ...\n"); @@ -433,7 +474,7 @@ public: return false; /* get kernel */ - string cubin = compile_kernel(requested_features); + string cubin = compile_kernel(requested_features, use_split_kernel()); if(cubin == "") return false; @@ -466,8 +507,14 @@ public: } } - void mem_alloc(device_memory& mem, MemoryType /*type*/) + void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + cuda_push_context(); CUdeviceptr device_pointer; size_t size = mem.memory_size(); @@ -504,7 +551,9 @@ public: void mem_zero(device_memory& mem) { - memset((void*)mem.data_pointer, 0, mem.memory_size()); + if(mem.data_pointer) { + memset((void*)mem.data_pointer, 0, mem.memory_size()); + } cuda_push_context(); if(mem.device_pointer) @@ -617,7 +666,7 @@ public: /* Data Storage */ if(interpolation == INTERPOLATION_NONE) { if(has_bindless_textures) { - mem_alloc(mem, MEM_READ_ONLY); + mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); cuda_push_context(); @@ -641,7 +690,7 @@ public: cuda_pop_context(); } else { - mem_alloc(mem, MEM_READ_ONLY); + mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); cuda_push_context(); @@ -1258,25 +1307,48 @@ public: /* Upload Bindless Mapping */ load_bindless_mapping(); - /* keep rendering tiles until done */ - while(task->acquire_tile(this, tile)) { - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; + if(!use_split_kernel()) { + /* keep rendering tiles until done */ + while(task->acquire_tile(this, tile)) { + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; - for(int sample = start_sample; sample < end_sample; sample++) { - if(task->get_cancel()) { - 
if(task->need_finish_queue == false) - break; - } + for(int sample = start_sample; sample < end_sample; sample++) { + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } - path_trace(tile, sample, branched); + path_trace(tile, sample, branched); - tile.sample = sample + 1; + tile.sample = sample + 1; - task->update_progress(&tile, tile.w*tile.h); + task->update_progress(&tile, tile.w*tile.h); + } + + task->release_tile(tile); + } + } + else { + DeviceRequestedFeatures requested_features; + if(!use_adaptive_compilation()) { + requested_features.max_closure = 64; } - task->release_tile(tile); + CUDASplitKernel split_kernel(this); + split_kernel.load_kernels(requested_features); + + while(task->acquire_tile(this, tile)) { + device_memory void_buffer; + split_kernel.path_trace(task, tile, void_buffer, void_buffer); + + task->release_tile(tile); + + if(task->get_cancel()) { + if(task->need_finish_queue == false) + break; + } + } } } else if(task->type == DeviceTask::SHADER) { @@ -1329,8 +1401,223 @@ public: { task_pool.cancel(); } + + friend class CUDASplitKernelFunction; + friend class CUDASplitKernel; }; +/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class + * now that the definition of that class is complete + */ +#undef cuda_assert +#define cuda_assert(stmt) \ + { \ + CUresult result = stmt; \ + \ + if(result != CUDA_SUCCESS) { \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + if(device->error_msg == "") \ + device->error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + device->cuda_error_documentation(); \ + } \ + } (void)0 + +/* split kernel */ + +class CUDASplitKernelFunction : public SplitKernelFunction{ + CUDADevice* device; + CUfunction func; +public: + CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {} + + /* enqueue the kernel, returns false if there is an error */ + bool 
enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/) + { + return enqueue(dim, NULL); + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, void *args[]) + { + device->cuda_push_context(); + + if(device->have_error()) + return false; + + /* we ignore dim.local_size for now, as this is faster */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); + + int xthreads = (int)sqrt(threads_per_block); + int ythreads = (int)sqrt(threads_per_block); + + int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads; + int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads; + + cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(func, + xblocks , yblocks, 1, /* blocks */ + xthreads, ythreads, 1, /* threads */ + 0, 0, args, 0)); + + device->cuda_pop_context(); + + return !device->have_error(); + } +}; + +CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) +{ +} + +size_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads) +{ + device_vector<uint> size_buffer; + size_buffer.resize(1); + device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); + + device->cuda_push_context(); + + uint threads = num_threads; + CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); + + struct args_t { + uint* num_threads; + CUdeviceptr* size; + }; + + args_t args = { + &threads, + &d_size + }; + + CUfunction state_buffer_size; + cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); + + cuda_assert(cuLaunchKernel(state_buffer_size, + 1, 1, 1, + 1, 1, 1, + 0, 0, &args, 0)); + + device->cuda_pop_context(); + + device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint)); + device->mem_free(size_buffer); + + return 
*size_buffer.get_data(); +} + +bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& /*kernel_globals*/, + device_memory& /*kernel_data*/, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs) +{ + device->cuda_push_context(); + + CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); + CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); + CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); + CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); + CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); + + CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state); + CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); + + int end_sample = rtile.start_sample + rtile.num_samples; + int queue_size = dim.global_size[0] * dim.global_size[1]; + + struct args_t { + CUdeviceptr* split_data_buffer; + int* num_elements; + CUdeviceptr* ray_state; + CUdeviceptr* rng_state; + int* start_sample; + int* end_sample; + int* sx; + int* sy; + int* sw; + int* sh; + int* offset; + int* stride; + CUdeviceptr* queue_index; + int* queuesize; + CUdeviceptr* use_queues_flag; + CUdeviceptr* work_pool_wgs; + int* num_samples; + CUdeviceptr* buffer; + }; + + args_t args = { + &d_split_data, + &num_global_elements, + &d_ray_state, + &d_rng_state, + &rtile.start_sample, + &end_sample, + &rtile.x, + &rtile.y, + &rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride, + &d_queue_index, + &queue_size, + &d_use_queues_flag, + &d_work_pool_wgs, + &rtile.num_samples, + &d_buffer + }; + + CUfunction data_init; + cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); + if(device->have_error()) { + return false; + } 
+ + CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args); + + device->cuda_pop_context(); + + return !device->have_error(); +} + +SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) +{ + CUfunction func; + + device->cuda_push_context(); + + cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); + if(device->have_error()) { + device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + return NULL; + } + + device->cuda_pop_context(); + + return new CUDASplitKernelFunction(device, func); +} + +int2 CUDASplitKernel::split_kernel_local_size() +{ + return make_int2(32, 1); +} + +int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/) +{ + /* TODO(mai): implement something here to detect ideal work size */ + return make_int2(256, 256); +} + bool device_cuda_init(void) { #ifdef WITH_CUDA_DYNLOAD diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 5b5b4dc6802..b69c3dad604 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -180,10 +180,27 @@ public: /* device pointer */ device_ptr device_pointer; -protected: - device_memory() {} + device_memory() + { + data_type = device_type_traits<uchar>::data_type; + data_elements = device_type_traits<uchar>::num_elements; + data_pointer = 0; + data_size = 0; + device_size = 0; + data_width = 0; + data_height = 0; + data_depth = 0; + device_pointer = 0; + } virtual ~device_memory() { assert(!device_pointer); } + void resize(size_t size) + { + data_size = size; + data_width = size; + } + +protected: /* no copying */ device_memory(const device_memory&); device_memory& operator = (const device_memory&); @@ -198,16 +215,8 @@ public: { data_type = device_type_traits<T>::data_type; data_elements = 
device_type_traits<T>::num_elements; - data_pointer = 0; - data_size = 0; - device_size = 0; - data_width = 0; - data_height = 0; - data_depth = 0; assert(data_elements > 0); - - device_pointer = 0; } virtual ~device_vector() {} @@ -266,6 +275,7 @@ public: data_height = 0; data_depth = 0; data_size = 0; + device_pointer = 0; } size_t size() diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 31b800640d3..3368fd3d756 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -106,11 +106,11 @@ public: return true; } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(const char *name, device_memory& mem, MemoryType type) { foreach(SubDevice& sub, devices) { mem.device_pointer = 0; - sub.device->mem_alloc(mem, type); + sub.device->mem_alloc(name, mem, type); sub.ptr_map[unique_ptr] = mem.device_pointer; } @@ -162,6 +162,7 @@ public: void mem_free(device_memory& mem) { device_ptr tmp = mem.device_pointer; + stats.mem_free(mem.device_size); foreach(SubDevice& sub, devices) { mem.device_pointer = sub.ptr_map[tmp]; @@ -170,7 +171,6 @@ public: } mem.device_pointer = 0; - stats.mem_free(mem.device_size); } void const_copy_to(const char *name, void *host, size_t size) @@ -202,6 +202,7 @@ public: void tex_free(device_memory& mem) { device_ptr tmp = mem.device_pointer; + stats.mem_free(mem.device_size); foreach(SubDevice& sub, devices) { mem.device_pointer = sub.ptr_map[tmp]; @@ -210,7 +211,6 @@ public: } mem.device_pointer = 0; - stats.mem_free(mem.device_size); } void pixels_alloc(device_memory& mem) diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 53eef6cf199..6dc4aecbc50 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -87,8 +87,14 @@ public: snd.write(); } - void mem_alloc(device_memory& mem, MemoryType type) + void mem_alloc(const char *name, device_memory& mem, 
MemoryType type) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + thread_scoped_lock lock(rpc_lock); mem.device_pointer = ++mem_counter; @@ -481,7 +487,7 @@ protected: mem.data_pointer = 0; /* perform the allocation on the actual device */ - device->mem_alloc(mem, type); + device->mem_alloc(NULL, mem, type); /* store a mapping to/from client_pointer and real device pointer */ pointer_mapping_insert(client_pointer, mem.device_pointer); diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp new file mode 100644 index 00000000000..b9705077fbf --- /dev/null +++ b/intern/cycles/device/device_split_kernel.cpp @@ -0,0 +1,281 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "device_split_kernel.h" + +#include "kernel_types.h" +#include "kernel_split_data_types.h" + +#include "util_time.h" + +CCL_NAMESPACE_BEGIN + +static const double alpha = 0.1; /* alpha for rolling average */ + +DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device) +{ + current_max_closure = -1; + first_tile = true; + + avg_time_per_sample = 0.0; + + kernel_path_init = NULL; + kernel_scene_intersect = NULL; + kernel_lamp_emission = NULL; + kernel_queue_enqueue = NULL; + kernel_background_buffer_update = NULL; + kernel_shader_eval = NULL; + kernel_holdout_emission_blurring_pathtermination_ao = NULL; + kernel_direct_lighting = NULL; + kernel_shadow_blocked = NULL; + kernel_next_iteration_setup = NULL; +} + +DeviceSplitKernel::~DeviceSplitKernel() +{ + device->mem_free(split_data); + device->mem_free(ray_state); + device->mem_free(use_queues_flag); + device->mem_free(queue_index); + device->mem_free(work_pool_wgs); + + delete kernel_path_init; + delete kernel_scene_intersect; + delete kernel_lamp_emission; + delete kernel_queue_enqueue; + delete kernel_background_buffer_update; + delete kernel_shader_eval; + delete kernel_holdout_emission_blurring_pathtermination_ao; + delete kernel_direct_lighting; + delete kernel_shadow_blocked; + delete kernel_next_iteration_setup; +} + +bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features) +{ +#define LOAD_KERNEL(name) \ + kernel_##name = get_split_kernel_function(#name, requested_features); \ + if(!kernel_##name) { \ + return false; \ + } + + LOAD_KERNEL(path_init); + LOAD_KERNEL(scene_intersect); + LOAD_KERNEL(lamp_emission); + LOAD_KERNEL(queue_enqueue); + LOAD_KERNEL(background_buffer_update); + LOAD_KERNEL(shader_eval); + LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); + LOAD_KERNEL(direct_lighting); + LOAD_KERNEL(shadow_blocked); + LOAD_KERNEL(next_iteration_setup); + +#undef LOAD_KERNEL + + current_max_closure = requested_features.max_closure; + + 
return true; +} + +size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size) +{ + size_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; + return max_buffer_size / size_per_element; +} + +bool DeviceSplitKernel::path_trace(DeviceTask *task, + RenderTile& tile, + device_memory& kgbuffer, + device_memory& kernel_data) +{ + if(device->have_error()) { + return false; + } + + /* Get local size */ + size_t local_size[2]; + { + int2 lsize = split_kernel_local_size(); + local_size[0] = lsize[0]; + local_size[1] = lsize[1]; + } + + /* Set gloabl size */ + size_t global_size[2]; + { + int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); + + /* Make sure that set work size is a multiple of local + * work size dimensions. + */ + global_size[0] = round_up(gsize[0], local_size[0]); + global_size[1] = round_up(gsize[1], local_size[1]); + } + + /* Number of elements in the global state buffer */ + int num_global_elements = global_size[0] * global_size[1]; + + /* Allocate all required global memory once. */ + if(first_tile) { + first_tile = false; + + /* Calculate max groups */ + + /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ + unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1; + + /* Allocate work_pool_wgs memory. 
*/ + work_pool_wgs.resize(max_work_groups * sizeof(unsigned int)); + device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE); + + queue_index.resize(NUM_QUEUES * sizeof(int)); + device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE); + + use_queues_flag.resize(sizeof(char)); + device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE); + + ray_state.resize(num_global_elements); + device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE); + + split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); + device->mem_alloc("split_data", split_data, MEM_READ_WRITE); + } + +#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ + if(device->have_error()) { \ + return false; \ + } \ + if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ + return false; \ + } + + tile.sample = tile.start_sample; + + /* for exponential increase between tile updates */ + int time_multiplier = 1; + + while(tile.sample < tile.start_sample + tile.num_samples) { + /* to keep track of how long it takes to run a number of samples */ + double start_time = time_dt(); + + /* initial guess to start rolling average */ + const int initial_num_samples = 1; + /* approx number of samples per second */ + int samples_per_second = (avg_time_per_sample > 0.0) ? 
+ int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples; + + RenderTile subtile = tile; + subtile.start_sample = tile.sample; + subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample); + + if(device->have_error()) { + return false; + } + + /* reset state memory here as global size for data_init + * kernel might not be large enough to do in kernel + */ + device->mem_zero(work_pool_wgs); + device->mem_zero(split_data); + + if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), + subtile, + num_global_elements, + kgbuffer, + kernel_data, + split_data, + ray_state, + queue_index, + use_queues_flag, + work_pool_wgs)) + { + return false; + } + + ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); + + bool activeRaysAvailable = true; + + while(activeRaysAvailable) { + /* Twice the global work size of other kernels for + * ckPathTraceKernel_shadow_blocked_direct_lighting. */ + size_t global_size_shadow_blocked[2]; + global_size_shadow_blocked[0] = global_size[0] * 2; + global_size_shadow_blocked[1] = global_size[1]; + + /* Do path-iteration in host [Enqueue Path-iteration kernels. */ + for(int PathIter = 0; PathIter < 16; PathIter++) { + ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size); + ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); + + if(task->get_cancel()) { + return true; + } + } + + /* Decide if we should exit path-iteration in host. 
*/ + device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1); + + activeRaysAvailable = false; + + for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { + if(int8_t(ray_state.get_data()[rayStateIter]) != RAY_INACTIVE) { + /* Not all rays are RAY_INACTIVE. */ + activeRaysAvailable = true; + break; + } + } + + if(task->get_cancel()) { + return true; + } + } + + double time_per_sample = ((time_dt()-start_time) / subtile.num_samples); + + if(avg_time_per_sample == 0.0) { + /* start rolling average */ + avg_time_per_sample = time_per_sample; + } + else { + avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample; + } + +#undef ENQUEUE_SPLIT_KERNEL + + tile.sample += subtile.num_samples; + task->update_progress(&tile, tile.w*tile.h*subtile.num_samples); + + time_multiplier = min(time_multiplier << 1, 10); + + if(task->get_cancel()) { + return true; + } + } + + return true; +} + +CCL_NAMESPACE_END + + diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h new file mode 100644 index 00000000000..cc3e1aa26ae --- /dev/null +++ b/intern/cycles/device/device_split_kernel.h @@ -0,0 +1,127 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __DEVICE_SPLIT_KERNEL_H__ +#define __DEVICE_SPLIT_KERNEL_H__ + +#include "device.h" +#include "buffers.h" + +CCL_NAMESPACE_BEGIN + +/* When allocate global memory in chunks. We may not be able to + * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; + * Since some bytes may be needed for aligning chunks of memory; + * This is the amount of memory that we dedicate for that purpose. + */ +#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB + +/* Types used for split kernel */ + +class KernelDimensions { +public: + size_t global_size[2]; + size_t local_size[2]; + + KernelDimensions(size_t global_size_[2], size_t local_size_[2]) + { + memcpy(global_size, global_size_, sizeof(global_size)); + memcpy(local_size, local_size_, sizeof(local_size)); + } +}; + +class SplitKernelFunction { +public: + virtual ~SplitKernelFunction() {} + + /* enqueue the kernel, returns false if there is an error */ + virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0; +}; + +class DeviceSplitKernel { +private: + Device *device; + + SplitKernelFunction *kernel_path_init; + SplitKernelFunction *kernel_scene_intersect; + SplitKernelFunction *kernel_lamp_emission; + SplitKernelFunction *kernel_queue_enqueue; + SplitKernelFunction *kernel_background_buffer_update; + SplitKernelFunction *kernel_shader_eval; + SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; + SplitKernelFunction *kernel_direct_lighting; + SplitKernelFunction *kernel_shadow_blocked; + SplitKernelFunction *kernel_next_iteration_setup; + + /* Global memory variables [porting]; These memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. + */ + device_memory split_data; + device_vector<uchar> ray_state; + device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. 
*/ + + /* Flag to make sceneintersect and lampemission kernel use queues. */ + device_memory use_queues_flag; + + /* Approximate time it takes to complete one sample */ + double avg_time_per_sample; + + /* Work pool with respect to each work group. */ + device_memory work_pool_wgs; + + /* clos_max value for which the kernels have been loaded currently. */ + int current_max_closure; + + /* Marked True in constructor and marked false at the end of path_trace(). */ + bool first_tile; + +public: + explicit DeviceSplitKernel(Device* device); + virtual ~DeviceSplitKernel(); + + bool load_kernels(const DeviceRequestedFeatures& requested_features); + bool path_trace(DeviceTask *task, + RenderTile& rtile, + device_memory& kgbuffer, + device_memory& kernel_data); + + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0; + size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, size_t max_buffer_size); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data_, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs) = 0; + + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0; + virtual int2 split_kernel_local_size() = 0; + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0; +}; + +CCL_NAMESPACE_END + +#endif /* __DEVICE_SPLIT_KERNEL_H__ */ + + + diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 8bd54c3d2b0..f31092fd9d2 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -51,6 +51,8 @@ public: int shader_filter; int shader_x, shader_w; + int passes_size; + explicit DeviceTask(Type type = PATH_TRACE); 
int get_subtask_count(int num, int max_size = 0); diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 4023ba89a10..6470cb8ff7e 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -26,29 +26,29 @@ CCL_NAMESPACE_BEGIN -#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) - -/* Macro declarations used with split kernel */ - -/* Macro to enable/disable work-stealing */ -#define __WORK_STEALING__ - -#define SPLIT_KERNEL_LOCAL_SIZE_X 64 -#define SPLIT_KERNEL_LOCAL_SIZE_Y 1 - -/* This value may be tuned according to the scene we are rendering. - * - * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected - * ray-bounces will improve performance. - */ -#define PATH_ITER_INC_FACTOR 8 +/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ +#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS +/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */ +# undef clEnqueueNDRangeKernel +# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ + clFinish(a); \ + CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); + +# undef clEnqueueWriteBuffer +# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ + clFinish(a); \ + CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); + +# undef clEnqueueReadBuffer +# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ + clFinish(a); \ + CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); +#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ -/* When allocate global memory in chunks. We may not be able to - * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks; - * Since some bytes may be needed for aligning chunks of memory; - * This is the amount of memory that we dedicate for that purpose. 
- */ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB +#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) struct OpenCLPlatformDevice { OpenCLPlatformDevice(cl_platform_id platform_id, @@ -248,6 +248,7 @@ public: bool device_initialized; string platform_name; + string device_name; bool opencl_error(cl_int err); void opencl_error(const string& message); @@ -266,10 +267,10 @@ public: /* Has to be implemented by the real device classes. * The base device will then load all these programs. */ - virtual void load_kernels(const DeviceRequestedFeatures& requested_features, + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, vector<OpenCLProgram*> &programs) = 0; - void mem_alloc(device_memory& mem, MemoryType type); + void mem_alloc(const char *name, device_memory& mem, MemoryType type); void mem_copy_to(device_memory& mem); void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); void mem_zero(device_memory& mem); @@ -326,16 +327,39 @@ protected: class ArgumentWrapper { public: - ArgumentWrapper() : size(0), pointer(NULL) {} - template <typename T> + ArgumentWrapper() : size(0), pointer(NULL) + { + } + + ArgumentWrapper(device_memory& argument) : size(sizeof(void*)), + pointer((void*)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)), + pointer((void*)(&argument.device_pointer)) + { + } + + template<typename T> ArgumentWrapper(T& argument) : size(sizeof(argument)), - pointer(&argument) { } + pointer(&argument) + { + } + ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), - pointer(&int_value) { } + pointer(&int_value) + { + } + ArgumentWrapper(float argument) : size(sizeof(float)), float_value(argument), - pointer(&float_value) { } + pointer(&float_value) + { + } + size_t size; int int_value; float float_value; diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index 
a2b900312e7..c5f44f84e8c 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -82,9 +82,10 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou cpPlatform = platform_device.platform_id; cdDevice = platform_device.device_id; platform_name = platform_device.platform_name; + device_name = platform_device.device_name; VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " - << platform_device.device_name << "."; + << device_name << "."; { /* try to use cached context */ @@ -113,12 +114,16 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou } cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if(opencl_error(ciErr)) + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating command queue"); return; + } null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); - if(opencl_error(ciErr)) + if(opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating memory buffer for NULL"); return; + } fprintf(stderr, "Device init success\n"); device_initialized = true; @@ -191,6 +196,8 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options) bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features) { + VLOG(2) << "Loading kernels for platform " << platform_name + << ", device " << device_name << "."; /* Verify if device was initialized. 
*/ if(!device_initialized) { fprintf(stderr, "OpenCL: failed to initialize device.\n"); @@ -206,11 +213,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea base_program.add_kernel(ustring("convert_to_half_float")); base_program.add_kernel(ustring("shader")); base_program.add_kernel(ustring("bake")); + base_program.add_kernel(ustring("zero_buffer")); vector<OpenCLProgram*> programs; programs.push_back(&base_program); /* Call actual class to fill the vector with its programs. */ - load_kernels(requested_features, programs); + if(!load_kernels(requested_features, programs)) { + return false; + } /* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks * serialize the calls internally, so it's not much use right now. @@ -242,8 +252,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea return true; } -void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type) +void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryType type) { + if(name) { + VLOG(1) << "Buffer allocate: " << name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + size_t size = mem.memory_size(); cl_mem_flags mem_flag; @@ -311,8 +327,61 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in void OpenCLDeviceBase::mem_zero(device_memory& mem) { if(mem.device_pointer) { - memset((void*)mem.data_pointer, 0, mem.memory_size()); - mem_copy_to(mem); + if(base_program.is_loaded()) { + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; + + cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer); + unsigned long long d_offset = 0; + unsigned long long d_size = 0; + + while(d_offset < mem.memory_size()) { + d_size = std::min<unsigned long long>(num_threads*sizeof(float4), mem.memory_size() - d_offset); + + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + + ciErr = clEnqueueNDRangeKernel(cqCommandQueue, + ckZeroBuffer, + 2, + NULL, + global_size, + NULL, + 0, + NULL, + NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + + d_offset += d_size; + } + } + + if(mem.data_pointer) { + memset((void*)mem.data_pointer, 0, mem.memory_size()); + } + + if(!base_program.is_loaded()) { + void* zero = (void*)mem.data_pointer; + + if(!mem.data_pointer) { + zero = util_aligned_malloc(mem.memory_size(), 16); + memset(zero, 0, mem.memory_size()); + } + + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + mem.memory_size(), + zero, + 0, + NULL, NULL)); + + if(!mem.data_pointer) { + util_aligned_free(zero); + } + } } } @@ -337,7 +406,7 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size) device_vector<uchar> *data = new device_vector<uchar>(); data->copy((uchar*)host, size); - mem_alloc(*data, MEM_READ_ONLY); + mem_alloc(name, *data, MEM_READ_ONLY); i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first; } else { @@ -356,7 +425,7 @@ void 
OpenCLDeviceBase::tex_alloc(const char *name, VLOG(1) << "Texture allocate: " << name << ", " << string_human_readable_number(mem.memory_size()) << " bytes. (" << string_human_readable_size(mem.memory_size()) << ")"; - mem_alloc(mem, MEM_READ_ONLY); + mem_alloc(NULL, mem, MEM_READ_ONLY); mem_copy_to(mem); assert(mem_map.find(name) == mem_map.end()); mem_map.insert(MemMap::value_type(name, mem.device_pointer)); diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp index 6ea7619e022..049e332272b 100644 --- a/intern/cycles/device/opencl/opencl_mega.cpp +++ b/intern/cycles/device/opencl/opencl_mega.cpp @@ -43,11 +43,12 @@ public: return true; } - virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/, + virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/, vector<OpenCLProgram*> &programs) { path_trace_program.add_kernel(ustring("path_trace")); programs.push_back(&path_trace_program); + return true; } ~OpenCLDeviceMegaKernel() diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 3c3c2150128..b651b4a848e 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -21,325 +21,48 @@ #include "buffers.h" #include "kernel_types.h" +#include "kernel_split_data_types.h" +#include "device_split_kernel.h" + +#include "util_logging.h" #include "util_md5.h" #include "util_path.h" #include "util_time.h" CCL_NAMESPACE_BEGIN -/* TODO(sergey): This is to keep tile split on OpenCL level working - * for now, since without this view-port render does not work as it - * should. - * - * Ideally it'll be done on the higher level, but we need to get ready - * for merge rather soon, so let's keep split logic private here in - * the file. 
- */ -class SplitRenderTile : public RenderTile { -public: - SplitRenderTile() - : RenderTile(), - buffer_offset_x(0), - buffer_offset_y(0), - rng_state_offset_x(0), - rng_state_offset_y(0), - buffer_rng_state_stride(0) {} - - explicit SplitRenderTile(RenderTile& tile) - : RenderTile(), - buffer_offset_x(0), - buffer_offset_y(0), - rng_state_offset_x(0), - rng_state_offset_y(0), - buffer_rng_state_stride(0) - { - x = tile.x; - y = tile.y; - w = tile.w; - h = tile.h; - start_sample = tile.start_sample; - num_samples = tile.num_samples; - sample = tile.sample; - resolution = tile.resolution; - offset = tile.offset; - stride = tile.stride; - buffer = tile.buffer; - rng_state = tile.rng_state; - buffers = tile.buffers; +class OpenCLSplitKernel; + +static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features) +{ + string build_options = "-D__SPLIT_KERNEL__ "; + build_options += requested_features.get_build_options(); + + /* Set compute device build option. */ + cl_device_type device_type; + device->ciErr = clGetDeviceInfo(device->cdDevice, + CL_DEVICE_TYPE, + sizeof(cl_device_type), + &device_type, + NULL); + assert(device->ciErr == CL_SUCCESS); + if(device_type == CL_DEVICE_TYPE_GPU) { + build_options += " -D__COMPUTE_DEVICE_GPU__"; } - /* Split kernel is device global memory constrained; - * hence split kernel cant render big tile size's in - * one go. If the user sets a big tile size (big tile size - * is a term relative to the available device global memory), - * we split the tile further and then call path_trace on - * each of those split tiles. The following variables declared, - * assist in achieving that purpose - */ - int buffer_offset_x; - int buffer_offset_y; - int rng_state_offset_x; - int rng_state_offset_y; - int buffer_rng_state_stride; -}; + return build_options; +} /* OpenCLDeviceSplitKernel's declaration/definition. 
*/ class OpenCLDeviceSplitKernel : public OpenCLDeviceBase { public: - /* Kernel declaration. */ + DeviceSplitKernel *split_kernel; OpenCLProgram program_data_init; - OpenCLProgram program_scene_intersect; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_queue_enqueue; - OpenCLProgram program_background_buffer_update; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked; - OpenCLProgram program_next_iteration_setup; - OpenCLProgram program_sum_all_radiance; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. - */ - cl_mem rng_coop; - cl_mem throughput_coop; - cl_mem L_transparent_coop; - cl_mem PathRadiance_coop; - cl_mem Ray_coop; - cl_mem PathState_coop; - cl_mem Intersection_coop; - cl_mem kgbuffer; /* KernelGlobals buffer. */ - - /* Global buffers for ShaderData. */ - cl_mem sd; /* ShaderData used in the main path-iteration loop. */ - cl_mem sd_DL_shadow; /* ShaderData used in Direct Lighting and - * shadow_blocked kernel. - */ - - /* Global memory required for shadow blocked and accum_radiance. */ - cl_mem BSDFEval_coop; - cl_mem ISLamp_coop; - cl_mem LightRay_coop; - cl_mem AOAlpha_coop; - cl_mem AOBSDF_coop; - cl_mem AOLightRay_coop; - cl_mem Intersection_coop_shadow; - -#ifdef WITH_CYCLES_DEBUG - /* DebugData memory */ - cl_mem debugdata_coop; -#endif - - /* Global state array that tracks ray state. */ - cl_mem ray_state; - - /* Per sample buffers. */ - cl_mem per_sample_output_buffers; - - /* Denotes which sample each ray is being processed for. */ - cl_mem work_array; - - /* Queue */ - cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int). */ - cl_mem Queue_index; /* Array of size num_queues * sizeof(int); - * Tracks the size of each queue. 
- */ - - /* Flag to make sceneintersect and lampemission kernel use queues. */ - cl_mem use_queues_flag; - - /* Amount of memory in output buffer associated with one pixel/thread. */ - size_t per_thread_output_buffer_size; - - /* Total allocatable available device memory. */ - size_t total_allocatable_memory; - - /* host version of ray_state; Used in checking host path-iteration - * termination. - */ - char *hostRayStateArray; - - /* Number of path-iterations to be done in one shot. */ - unsigned int PathIteration_times; - -#ifdef __WORK_STEALING__ - /* Work pool with respect to each work group. */ - cl_mem work_pool_wgs; - - /* Denotes the maximum work groups possible w.r.t. current tile size. */ - unsigned int max_work_groups; -#endif - - /* clos_max value for which the kernels have been loaded currently. */ - int current_max_closure; - - /* Marked True in constructor and marked false at the end of path_trace(). */ - bool first_tile; - - OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) - : OpenCLDeviceBase(info, stats, background_) - { - background = background_; - - /* Initialize cl_mem variables. */ - kgbuffer = NULL; - sd = NULL; - sd_DL_shadow = NULL; - - rng_coop = NULL; - throughput_coop = NULL; - L_transparent_coop = NULL; - PathRadiance_coop = NULL; - Ray_coop = NULL; - PathState_coop = NULL; - Intersection_coop = NULL; - ray_state = NULL; - - AOAlpha_coop = NULL; - AOBSDF_coop = NULL; - AOLightRay_coop = NULL; - BSDFEval_coop = NULL; - ISLamp_coop = NULL; - LightRay_coop = NULL; - Intersection_coop_shadow = NULL; - -#ifdef WITH_CYCLES_DEBUG - debugdata_coop = NULL; -#endif - - work_array = NULL; - - /* Queue. 
*/ - Queue_data = NULL; - Queue_index = NULL; - use_queues_flag = NULL; - - per_sample_output_buffers = NULL; - - per_thread_output_buffer_size = 0; - hostRayStateArray = NULL; - PathIteration_times = PATH_ITER_INC_FACTOR; -#ifdef __WORK_STEALING__ - work_pool_wgs = NULL; - max_work_groups = 0; -#endif - current_max_closure = -1; - first_tile = true; - - /* Get device's maximum memory that can be allocated. */ - ciErr = clGetDeviceInfo(cdDevice, - CL_DEVICE_MAX_MEM_ALLOC_SIZE, - sizeof(size_t), - &total_allocatable_memory, - NULL); - assert(ciErr == CL_SUCCESS); - if(platform_name == "AMD Accelerated Parallel Processing") { - /* This value is tweak-able; AMD platform does not seem to - * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE - * is considered for further computation. - */ - total_allocatable_memory /= 2; - } - } - - virtual bool show_samples() const { - return false; - } - - /* Split kernel utility functions. */ - size_t get_tex_size(const char *tex_name) - { - cl_mem ptr; - size_t ret_size = 0; - MemMap::iterator i = mem_map.find(tex_name); - if(i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - ciErr = clGetMemObjectInfo(ptr, - CL_MEM_SIZE, - sizeof(ret_size), - &ret_size, - NULL); - assert(ciErr == CL_SUCCESS); - } - return ret_size; - } - - size_t get_shader_data_size(size_t max_closure) - { - /* ShaderData size with variable size ShaderClosure array */ - return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure)); - } - - /* Returns size of KernelGlobals structure associated with OpenCL. */ - size_t get_KernelGlobals_size() - { - /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to - * fetch its size. 
- */ - typedef struct KernelGlobals { - ccl_constant KernelData *data; -#define KERNEL_TEX(type, ttype, name) \ - ccl_global type *name; -#include "kernel_textures.h" -#undef KERNEL_TEX - void *sd_input; - void *isect_shadow; - } KernelGlobals; - - return sizeof(KernelGlobals); - } + OpenCLProgram program_state_buffer_size; - virtual void load_kernels(const DeviceRequestedFeatures& requested_features, - vector<OpenCLProgram*> &programs) - { - string build_options = "-D__SPLIT_KERNEL__ "; -#ifdef __WORK_STEALING__ - build_options += "-D__WORK_STEALING__ "; -#endif - build_options += requested_features.get_build_options(); - - /* Set compute device build option. */ - cl_device_type device_type; - ciErr = clGetDeviceInfo(cdDevice, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - &device_type, - NULL); - assert(ciErr == CL_SUCCESS); - if(device_type == CL_DEVICE_TYPE_GPU) { - build_options += " -D__COMPUTE_DEVICE_GPU__"; - } - -#define GLUE(a, b) a ## b -#define LOAD_KERNEL(name) \ - do { \ - GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \ - GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \ - programs.push_back(&GLUE(program_, name)); \ - } while(false) - - LOAD_KERNEL(data_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(background_buffer_update); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(sum_all_radiance); - -#undef FIND_KERNEL -#undef GLUE - - current_max_closure = requested_features.max_closure; - } + OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_); ~OpenCLDeviceSplitKernel() { @@ -347,960 +70,298 @@ public: /* Release kernels */ program_data_init.release(); - program_scene_intersect.release(); - program_lamp_emission.release(); - 
program_queue_enqueue.release(); - program_background_buffer_update.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_direct_lighting.release(); - program_shadow_blocked.release(); - program_next_iteration_setup.release(); - program_sum_all_radiance.release(); - - /* Release global memory */ - release_mem_object_safe(rng_coop); - release_mem_object_safe(throughput_coop); - release_mem_object_safe(L_transparent_coop); - release_mem_object_safe(PathRadiance_coop); - release_mem_object_safe(Ray_coop); - release_mem_object_safe(PathState_coop); - release_mem_object_safe(Intersection_coop); - release_mem_object_safe(kgbuffer); - release_mem_object_safe(sd); - release_mem_object_safe(sd_DL_shadow); - release_mem_object_safe(ray_state); - release_mem_object_safe(AOAlpha_coop); - release_mem_object_safe(AOBSDF_coop); - release_mem_object_safe(AOLightRay_coop); - release_mem_object_safe(BSDFEval_coop); - release_mem_object_safe(ISLamp_coop); - release_mem_object_safe(LightRay_coop); - release_mem_object_safe(Intersection_coop_shadow); -#ifdef WITH_CYCLES_DEBUG - release_mem_object_safe(debugdata_coop); -#endif - release_mem_object_safe(use_queues_flag); - release_mem_object_safe(Queue_data); - release_mem_object_safe(Queue_index); - release_mem_object_safe(work_array); -#ifdef __WORK_STEALING__ - release_mem_object_safe(work_pool_wgs); -#endif - release_mem_object_safe(per_sample_output_buffers); - - if(hostRayStateArray != NULL) { - free(hostRayStateArray); - } + + delete split_kernel; } - void path_trace(DeviceTask *task, - SplitRenderTile& rtile, - int2 max_render_feasible_tile_size) + virtual bool load_kernels(const DeviceRequestedFeatures& requested_features, + vector<OpenCLDeviceBase::OpenCLProgram*> &programs) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_buffer = CL_MEM_PTR(rtile.buffer); - cl_mem d_rng_state = 
CL_MEM_PTR(rtile.rng_state); - cl_int d_x = rtile.x; - cl_int d_y = rtile.y; - cl_int d_w = rtile.w; - cl_int d_h = rtile.h; - cl_int d_offset = rtile.offset; - cl_int d_stride = rtile.stride; - - /* Make sure that set render feasible tile size is a multiple of local - * work size dimensions. - */ - assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0); - assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0); - - size_t global_size[2]; - size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X, - SPLIT_KERNEL_LOCAL_SIZE_Y}; + program_data_init = OpenCLDeviceBase::OpenCLProgram(this, + "split_data_init", + "kernel_data_init.cl", + get_build_options(this, requested_features)); + program_data_init.add_kernel(ustring("path_trace_data_init")); + programs.push_back(&program_data_init); + + program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this, + "split_state_buffer_size", + "kernel_state_buffer_size.cl", + get_build_options(this, requested_features)); + program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size")); + programs.push_back(&program_state_buffer_size); + + return split_kernel->load_kernels(requested_features); + } - /* Set the range of samples to be processed for every ray in - * path-regeneration logic. - */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - cl_int num_samples = rtile.num_samples; - -#ifdef __WORK_STEALING__ - global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0]; - global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int num_parallel_samples = 1; -#else - global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1]; - unsigned int num_threads = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - unsigned int num_tile_columns_possible = num_threads / global_size[1]; - /* Estimate number of parallel samples that can be - * processed in parallel. 
- */ - unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w, - rtile.num_samples); - /* Wavefront size in AMD is 64. - * TODO(sergey): What about other platforms? - */ - if(num_parallel_samples >= 64) { - /* TODO(sergey): Could use generic round-up here. */ - num_parallel_samples = (num_parallel_samples / 64) * 64; + void thread_run(DeviceTask *task) + { + if(task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); } - assert(num_parallel_samples != 0); - - global_size[0] = d_w * num_parallel_samples; -#endif /* __WORK_STEALING__ */ - - assert(global_size[0] * global_size[1] <= - max_render_feasible_tile_size.x * max_render_feasible_tile_size.y); - - /* Allocate all required global memory once. */ - if(first_tile) { - size_t num_global_elements = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - /* TODO(sergey): This will actually over-allocate if - * particular kernel does not support multiclosure. - */ - size_t shaderdata_size = get_shader_data_size(current_max_closure); - -#ifdef __WORK_STEALING__ - /* Calculate max groups */ - size_t max_global_size[2]; - size_t tile_x = max_render_feasible_tile_size.x; - size_t tile_y = max_render_feasible_tile_size.y; - max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0]; - max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1]; - max_work_groups = (max_global_size[0] * max_global_size[1]) / - (local_size[0] * local_size[1]); - /* Allocate work_pool_wgs memory. */ - work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int)); -#endif /* __WORK_STEALING__ */ - - /* Allocate queue_index memory only once. */ - Queue_index = mem_alloc(NUM_QUEUES * sizeof(int)); - use_queues_flag = mem_alloc(sizeof(char)); - kgbuffer = mem_alloc(get_KernelGlobals_size()); - - /* Create global buffers for ShaderData. 
*/ - sd = mem_alloc(num_global_elements * shaderdata_size); - sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size); - - /* Creation of global memory buffers which are shared among - * the kernels. - */ - rng_coop = mem_alloc(num_global_elements * sizeof(RNG)); - throughput_coop = mem_alloc(num_global_elements * sizeof(float3)); - L_transparent_coop = mem_alloc(num_global_elements * sizeof(float)); - PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance)); - Ray_coop = mem_alloc(num_global_elements * sizeof(Ray)); - PathState_coop = mem_alloc(num_global_elements * sizeof(PathState)); - Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection)); - AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3)); - AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3)); - AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); - BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval)); - ISLamp_coop = mem_alloc(num_global_elements * sizeof(int)); - LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray)); - Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection)); - -#ifdef WITH_CYCLES_DEBUG - debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData)); -#endif - - ray_state = mem_alloc(num_global_elements * sizeof(char)); - - hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char)); - assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory"); - - Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int))); - work_array = mem_alloc(num_global_elements * sizeof(unsigned int)); - per_sample_output_buffers = mem_alloc(num_global_elements * - per_thread_output_buffer_size); + else if(task->type == DeviceTask::SHADER) { + shader(*task); } + else if(task->type == DeviceTask::PATH_TRACE) { + RenderTile tile; - cl_int dQueue_size = global_size[0] * global_size[1]; - - cl_uint start_arg_index = - 
kernel_set_args(program_data_init(), - 0, - kgbuffer, - sd_DL_shadow, - d_data, - per_sample_output_buffers, - d_rng_state, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop_shadow, - ray_state); - -/* TODO(sergey): Avoid map lookup here. */ + /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to + * fetch its size. + */ + typedef struct KernelGlobals { + ccl_constant KernelData *data; #define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(program_data_init(), &start_arg_index, #name); + ccl_global type *name; #include "kernel_textures.h" #undef KERNEL_TEX + SplitData split_data; + SplitParams split_param_data; + } KernelGlobals; - start_arg_index += - kernel_set_args(program_data_init(), - start_arg_index, - start_sample, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride, - rtile.rng_state_offset_x, - rtile.rng_state_offset_y, - rtile.buffer_rng_state_stride, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, - work_array, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_scene_intersect(), - 0, - kgbuffer, - d_data, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - d_w, - d_h, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_lamp_emission(), - 0, - kgbuffer, - d_data, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - d_w, - d_h, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag, - num_parallel_samples); - - kernel_set_args(program_queue_enqueue(), - 0, - Queue_data, - Queue_index, - ray_state, - dQueue_size); - - kernel_set_args(program_background_buffer_update(), - 0, - kgbuffer, - d_data, - per_sample_output_buffers, - 
d_rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - d_w, - d_h, - d_x, - d_y, - d_stride, - rtile.rng_state_offset_x, - rtile.rng_state_offset_y, - rtile.buffer_rng_state_stride, - work_array, - Queue_data, - Queue_index, - dQueue_size, - end_sample, - start_sample, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef WITH_CYCLES_DEBUG - debugdata_coop, -#endif - num_parallel_samples); - - kernel_set_args(program_shader_eval(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(), - 0, - kgbuffer, - d_data, - sd, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - d_w, - d_h, - d_x, - d_y, - d_stride, - ray_state, - work_array, - Queue_data, - Queue_index, - dQueue_size, -#ifdef __WORK_STEALING__ - start_sample, -#endif - num_parallel_samples); - - kernel_set_args(program_direct_lighting(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_shadow_blocked(), - 0, - kgbuffer, - d_data, - PathState_coop, - LightRay_coop, - AOLightRay_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size); - - kernel_set_args(program_next_iteration_setup(), - 0, - kgbuffer, - d_data, - sd, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_coop, - ISLamp_coop, - BSDFEval_coop, - AOLightRay_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - Queue_data, - Queue_index, - dQueue_size, - use_queues_flag); - - kernel_set_args(program_sum_all_radiance(), - 0, - d_data, - d_buffer, - 
per_sample_output_buffers, - num_parallel_samples, - d_w, - d_h, - d_stride, - rtile.buffer_offset_x, - rtile.buffer_offset_y, - rtile.buffer_rng_state_stride, - start_sample); - - /* Macro for Enqueuing split kernels. */ -#define GLUE(a, b) a ## b -#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \ - { \ - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \ - GLUE(program_, \ - kernelName)(), \ - 2, \ - NULL, \ - globalSize, \ - localSize, \ - 0, \ - NULL, \ - NULL); \ - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \ - if(ciErr != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \ - clewErrorString(ciErr)); \ - opencl_error(message); \ - return; \ - } \ - } (void) 0 - - /* Enqueue ckPathTraceKernel_data_init kernel. */ - ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size); - bool activeRaysAvailable = true; - - /* Record number of time host intervention has been made */ - unsigned int numHostIntervention = 0; - unsigned int numNextPathIterTimes = PathIteration_times; - bool canceled = false; - while(activeRaysAvailable) { - /* Twice the global work size of other kernels for - * ckPathTraceKernel_shadow_blocked_direct_lighting. */ - size_t global_size_shadow_blocked[2]; - global_size_shadow_blocked[0] = global_size[0] * 2; - global_size_shadow_blocked[1] = global_size[1]; - - /* Do path-iteration in host [Enqueue Path-iteration kernels. 
*/ - for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); - - if(task->get_cancel()) { - canceled = true; - break; - } - } + /* Allocate buffer for kernel globals */ + device_memory kgbuffer; + kgbuffer.resize(sizeof(KernelGlobals)); + mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE); - /* Read ray-state into Host memory to decide if we should exit - * path-iteration in host. - */ - ciErr = clEnqueueReadBuffer(cqCommandQueue, - ray_state, - CL_TRUE, - 0, - global_size[0] * global_size[1] * sizeof(char), - hostRayStateArray, - 0, - NULL, - NULL); - assert(ciErr == CL_SUCCESS); - - activeRaysAvailable = false; - - for(int rayStateIter = 0; - rayStateIter < global_size[0] * global_size[1]; - ++rayStateIter) - { - if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) { - /* Not all rays are RAY_INACTIVE. */ - activeRaysAvailable = true; - break; - } - } + /* Keep rendering tiles until done. */ + while(task->acquire_tile(this, tile)) { + split_kernel->path_trace(task, + tile, + kgbuffer, + *const_mem_map["__data"]); - if(activeRaysAvailable) { - numHostIntervention++; - PathIteration_times = PATH_ITER_INC_FACTOR; - /* Host intervention done before all rays become RAY_INACTIVE; - * Set do more initial iterations for the next tile. + /* Complete kernel execution before release tile. 
*/ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. */ - numNextPathIterTimes += PATH_ITER_INC_FACTOR; - } + clFinish(cqCommandQueue); - if(task->get_cancel()) { - canceled = true; - break; + task->release_tile(tile); } - } - /* Execute SumALLRadiance kernel to accumulate radiance calculated in - * per_sample_output_buffers into RenderTile's output buffer. - */ - if(!canceled) { - size_t sum_all_radiance_local_size[2] = {16, 16}; - size_t sum_all_radiance_global_size[2]; - sum_all_radiance_global_size[0] = - (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) * - sum_all_radiance_local_size[0]; - sum_all_radiance_global_size[1] = - (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) * - sum_all_radiance_local_size[1]; - ENQUEUE_SPLIT_KERNEL(sum_all_radiance, - sum_all_radiance_global_size, - sum_all_radiance_local_size); - } - -#undef ENQUEUE_SPLIT_KERNEL -#undef GLUE - - if(numHostIntervention == 0) { - /* This means that we are executing kernel more than required - * Must avoid this for the next sample/tile. - */ - PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ? - PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR; + mem_free(kgbuffer); } - else { - /* Number of path-iterations done for this tile is set as - * Initial path-iteration times for the next tile - */ - PathIteration_times = numNextPathIterTimes; - } - - first_tile = false; } - /* Calculates the amount of memory that has to be always - * allocated in order for the split kernel to function. 
- * This memory is tile/scene-property invariant (meaning, - * the value returned by this function does not depend - * on the user set tile size or scene properties. - */ - size_t get_invariable_mem_allocated() - { - size_t total_invariable_mem_allocated = 0; - size_t KernelGlobals_size = 0; - - KernelGlobals_size = get_KernelGlobals_size(); - - total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */ - total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */ - total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */ - - return total_invariable_mem_allocated; - } +protected: + /* ** Those guys are for working around some compiler-specific bugs ** */ - /* Calculate the memory that has-to-be/has-been allocated for - * the split kernel to function. - */ - size_t get_tile_specific_mem_allocated(const int2 tile_size) + string build_options_for_base_program( + const DeviceRequestedFeatures& requested_features) - { - size_t tile_specific_mem_allocated = 0; - - /* Get required tile info */ - unsigned int user_set_tile_w = tile_size.x; - unsigned int user_set_tile_h = tile_size.y; - -#ifdef __WORK_STEALING__ - /* Calculate memory to be allocated for work_pools in - * case of work_stealing. 
- */ - size_t max_global_size[2]; - size_t max_num_work_pools = 0; - max_global_size[0] = - (((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - max_global_size[1] = - (((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - max_num_work_pools = - (max_global_size[0] * max_global_size[1]) / - (SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y); - tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int); -#endif - - tile_specific_mem_allocated += - user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size; - tile_specific_mem_allocated += - user_set_tile_w * user_set_tile_h * sizeof(RNG); - - return tile_specific_mem_allocated; + return requested_features.get_build_options(); } - /* Calculates the texture memories and KernelData (d_data) memory - * that has been allocated. - */ - size_t get_scene_specific_mem_allocated(cl_mem d_data) - { - size_t scene_specific_mem_allocated = 0; - /* Calculate texture memories. */ -#define KERNEL_TEX(type, ttype, name) \ - scene_specific_mem_allocated += get_tex_size(#name); -#include "kernel_textures.h" -#undef KERNEL_TEX - size_t d_data_size; - ciErr = clGetMemObjectInfo(d_data, - CL_MEM_SIZE, - sizeof(d_data_size), - &d_data_size, - NULL); - assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info"); - scene_specific_mem_allocated += d_data_size; - return scene_specific_mem_allocated; - } + friend class OpenCLSplitKernel; + friend class OpenCLSplitKernelFunction; +}; - /* Calculate the memory required for one thread in split kernel. */ - size_t get_per_thread_memory() - { - size_t shaderdata_size = 0; - /* TODO(sergey): This will actually over-allocate if - * particular kernel does not support multiclosure. 
- */ - shaderdata_size = get_shader_data_size(current_max_closure); - size_t retval = sizeof(RNG) - + sizeof(float3) /* Throughput size */ - + sizeof(float) /* L transparent size */ - + sizeof(char) /* Ray state size */ - + sizeof(unsigned int) /* Work element size */ - + sizeof(int) /* ISLamp_size */ - + sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState) - + sizeof(Intersection) /* Overall isect */ - + sizeof(Intersection) /* Instersection_coop_AO */ - + sizeof(Intersection) /* Intersection coop DL */ - + shaderdata_size /* Overall ShaderData */ - + (shaderdata_size * 2) /* ShaderData : DL and shadow */ - + sizeof(Ray) + sizeof(BsdfEval) - + sizeof(float3) /* AOAlpha size */ - + sizeof(float3) /* AOBSDF size */ - + sizeof(Ray) - + (sizeof(int) * NUM_QUEUES) - + per_thread_output_buffer_size; - return retval; - } +class OpenCLSplitKernelFunction : public SplitKernelFunction { +public: + OpenCLDeviceSplitKernel* device; + OpenCLDeviceBase::OpenCLProgram program; - /* Considers the total memory available in the device and - * and returns the maximum global work size possible. - */ - size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data) - { - /* Calculate invariably allocated memory. */ - size_t invariable_mem_allocated = get_invariable_mem_allocated(); - /* Calculate tile specific allocated memory. */ - size_t tile_specific_mem_allocated = - get_tile_specific_mem_allocated(tile_size); - /* Calculate scene specific allocated memory. */ - size_t scene_specific_mem_allocated = - get_scene_specific_mem_allocated(d_data); - /* Calculate total memory available for the threads in global work size. 
*/ - size_t available_memory = total_allocatable_memory - - invariable_mem_allocated - - tile_specific_mem_allocated - - scene_specific_mem_allocated - - DATA_ALLOCATION_MEM_FACTOR; - size_t per_thread_memory_required = get_per_thread_memory(); - return (available_memory / per_thread_memory_required); - } + OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {} + ~OpenCLSplitKernelFunction() { program.release(); } - /* Checks if the device has enough memory to render the whole tile; - * If not, we should split single tile into multiple tiles of small size - * and process them all. - */ - bool need_to_split_tile(unsigned int d_w, - unsigned int d_h, - int2 max_render_feasible_tile_size) + virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) { - size_t global_size_estimate[2]; - /* TODO(sergey): Such round-ups are in quite few places, need to replace - * them with an utility macro. - */ - global_size_estimate[0] = - (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - global_size_estimate[1] = - (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - if((global_size_estimate[0] * global_size_estimate[1]) > - (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y)) - { - return true; - } - else { + device->kernel_set_args(program(), 0, kg, data); + + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + program(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); return false; } + + return true; } +}; - /* Considers the scene properties, global memory available in the device - * and returns a rectanglular tile dimension (approx the maximum) - * that should 
render on split kernel. - */ - int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size) - { - int2 max_render_feasible_tile_size; - int square_root_val = (int)sqrt(feasible_global_work_size); - max_render_feasible_tile_size.x = square_root_val; - max_render_feasible_tile_size.y = square_root_val; - /* Ciel round-off max_render_feasible_tile_size. */ - int2 ceil_render_feasible_tile_size; - ceil_render_feasible_tile_size.x = - (((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - ceil_render_feasible_tile_size.y = - (((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <= - feasible_global_work_size) - { - return ceil_render_feasible_tile_size; - } - /* Floor round-off max_render_feasible_tile_size. */ - int2 floor_render_feasible_tile_size; - floor_render_feasible_tile_size.x = - (max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) * - SPLIT_KERNEL_LOCAL_SIZE_X; - floor_render_feasible_tile_size.y = - (max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - return floor_render_feasible_tile_size; +class OpenCLSplitKernel : public DeviceSplitKernel { + OpenCLDeviceSplitKernel *device; +public: + explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) { } - /* Try splitting the current tile into multiple smaller - * almost-square-tiles. 
- */ - int2 get_split_tile_size(RenderTile rtile, - int2 max_render_feasible_tile_size) + virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, + const DeviceRequestedFeatures& requested_features) { - int2 split_tile_size; - int num_global_threads = max_render_feasible_tile_size.x * - max_render_feasible_tile_size.y; - int d_w = rtile.w; - int d_h = rtile.h; - /* Ceil round off d_w and d_h */ - d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - while(d_w * d_h > num_global_threads) { - /* Halve the longer dimension. */ - if(d_w >= d_h) { - d_w = d_w / 2; - d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - } - else { - d_h = d_h / 2; - d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - } + OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device); + + kernel->program = OpenCLDeviceBase::OpenCLProgram(device, + "split_" + kernel_name, + "kernel_" + kernel_name + ".cl", + get_build_options(device, requested_features)); + kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); + kernel->program.load(); + + if(!kernel->program.is_loaded()) { + delete kernel; + return NULL; } - split_tile_size.x = d_w; - split_tile_size.y = d_h; - return split_tile_size; + + return kernel; } - /* Splits existing tile into multiple tiles of tile size split_tile_size. */ - vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size) + virtual size_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) { - vector<SplitRenderTile> to_path_trace_rtile; - int d_w = rtile.w; - int d_h = rtile.h; - int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1); - int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1); - /* Buffer and rng_state offset calc. 
*/ - size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride); - size_t offset_x = offset_index % rtile.stride; - size_t offset_y = offset_index / rtile.stride; - /* Resize to_path_trace_rtile. */ - to_path_trace_rtile.resize(num_tiles_x * num_tiles_y); - for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) { - for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) { - int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x; - to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x; - to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y; - to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x; - to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y; - to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample; - to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples; - to_path_trace_rtile[rtile_index].sample = rtile.sample; - to_path_trace_rtile[rtile_index].resolution = rtile.resolution; - to_path_trace_rtile[rtile_index].offset = rtile.offset; - to_path_trace_rtile[rtile_index].buffers = rtile.buffers; - to_path_trace_rtile[rtile_index].buffer = rtile.buffer; - to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state; - to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x); - to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y); - to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride; - /* Fill width and height of the new render tile. */ - to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ? - (d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */ - : split_tile_size.x; - to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ? 
- (d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */ - : split_tile_size.y; - to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w; - } + device_vector<uint> size_buffer; + size_buffer.resize(1); + device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE); + + uint threads = num_threads; + device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer); + + size_t global_size = 64; + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + device->program_state_buffer_size(), + 1, + NULL, + &global_size, + NULL, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint)); + device->mem_free(size_buffer); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return 0; } - return to_path_trace_rtile; + + return *size_buffer.get_data(); } - void thread_run(DeviceTask *task) + virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, + RenderTile& rtile, + int num_global_elements, + device_memory& kernel_globals, + device_memory& kernel_data, + device_memory& split_data, + device_memory& ray_state, + device_memory& queue_index, + device_memory& use_queues_flag, + device_memory& work_pool_wgs + ) { - if(task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - } - else if(task->type == DeviceTask::PATH_TRACE) { - RenderTile tile; - bool initialize_data_and_check_render_feasibility = false; - bool need_to_split_tiles_further = false; - int2 max_render_feasible_tile_size; - size_t feasible_global_work_size; - const int2 tile_size = task->requested_tile_size; - /* Keep rendering tiles until done. 
*/ - while(task->acquire_tile(this, tile)) { - if(!initialize_data_and_check_render_feasibility) { - /* Initialize data. */ - /* Calculate per_thread_output_buffer_size. */ - size_t output_buffer_size = 0; - ciErr = clGetMemObjectInfo((cl_mem)tile.buffer, - CL_MEM_SIZE, - sizeof(output_buffer_size), - &output_buffer_size, - NULL); - assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info"); - /* This value is different when running on AMD and NV. */ - if(background) { - /* In offline render the number of buffer elements - * associated with tile.buffer is the current tile size. - */ - per_thread_output_buffer_size = - output_buffer_size / (tile.w * tile.h); - } - else { - /* interactive rendering, unlike offline render, the number of buffer elements - * associated with tile.buffer is the entire viewport size. - */ - per_thread_output_buffer_size = - output_buffer_size / (tile.buffers->params.width * - tile.buffers->params.height); - } - /* Check render feasibility. */ - feasible_global_work_size = get_feasible_global_work_size( - tile_size, - CL_MEM_PTR(const_mem_map["__data"]->device_pointer)); - max_render_feasible_tile_size = - get_max_render_feasible_tile_size( - feasible_global_work_size); - need_to_split_tiles_further = - need_to_split_tile(tile_size.x, - tile_size.y, - max_render_feasible_tile_size); - initialize_data_and_check_render_feasibility = true; - } - if(need_to_split_tiles_further) { - int2 split_tile_size = - get_split_tile_size(tile, - max_render_feasible_tile_size); - vector<SplitRenderTile> to_path_trace_render_tiles = - split_tiles(tile, split_tile_size); - /* Print message to console */ - if(background && (to_path_trace_render_tiles.size() > 1)) { - fprintf(stderr, "Message : Tiles need to be split " - "further inside path trace (due to insufficient " - "device-global-memory for split kernel to " - "function) \n" - "The current tile of dimensions %dx%d is split " - "into tiles of dimension %dx%d for render \n", - tile.w, tile.h, - 
split_tile_size.x, - split_tile_size.y); - } - /* Process all split tiles. */ - for(int tile_iter = 0; - tile_iter < to_path_trace_render_tiles.size(); - ++tile_iter) - { - path_trace(task, - to_path_trace_render_tiles[tile_iter], - max_render_feasible_tile_size); - } - } - else { - /* No splitting required; process the entire tile at once. */ - /* Render feasible tile size is user-set-tile-size itself. */ - max_render_feasible_tile_size.x = - (((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_X; - max_render_feasible_tile_size.y = - (((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) * - SPLIT_KERNEL_LOCAL_SIZE_Y; - /* buffer_rng_state_stride is stride itself. */ - SplitRenderTile split_tile(tile); - split_tile.buffer_rng_state_stride = tile.stride; - path_trace(task, split_tile, max_render_feasible_tile_size); - } - tile.sample = tile.start_sample + tile.num_samples; + cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. - */ - clFinish(cqCommandQueue); + /* Set the range of samples to be processed for every ray in + * path-regeneration logic. + */ + cl_int start_sample = rtile.start_sample; + cl_int end_sample = rtile.start_sample + rtile.num_samples; - task->release_tile(tile); - } + cl_uint start_arg_index = + device->kernel_set_args(device->program_data_init(), + 0, + kernel_globals, + kernel_data, + split_data, + num_global_elements, + ray_state, + rtile.rng_state); + +/* TODO(sergey): Avoid map lookup here. 
*/ +#define KERNEL_TEX(type, ttype, name) \ + device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name); +#include "kernel_textures.h" +#undef KERNEL_TEX + + start_arg_index += + device->kernel_set_args(device->program_data_init(), + start_arg_index, + start_sample, + end_sample, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + queue_index, + dQueue_size, + use_queues_flag, + work_pool_wgs, + rtile.num_samples, + rtile.buffer); + + /* Enqueue ckPathTraceKernel_data_init kernel. */ + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + device->program_data_init(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if(device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; } + + return true; } -protected: - cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE) + virtual int2 split_kernel_local_size() - { - cl_mem ptr; - assert(bufsize != 0); - ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - return ptr; + return make_int2(64, 1); } - /* ** Those guys are for workign around some compiler-specific bugs ** */ - - string build_options_for_base_program( - const DeviceRequestedFeatures& requested_features) + virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/) - { - return requested_features.get_build_options(); + size_t max_buffer_size; + clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_buffer_size, NULL); + VLOG(1) << "Maximum device allocation size: " + << string_human_readable_number(max_buffer_size) << " bytes. 
(" + << string_human_readable_size(max_buffer_size) << ")."; + + size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2); + int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; } }; +OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_) +: OpenCLDeviceBase(info, stats, background_) +{ + split_kernel = new OpenCLSplitKernel(this); + + background = background_; +} + Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background) { return new OpenCLDeviceSplitKernel(info, stats, background); diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 82e1640e508..d5c19bf5386 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -19,6 +19,7 @@ #include "opencl.h" #include "util_logging.h" +#include "util_md5.h" #include "util_path.h" #include "util_time.h" @@ -309,6 +310,8 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src) string build_options; build_options = device->kernel_build_options(debug_src) + kernel_build_options; + VLOG(1) << "Build options passed to clBuildProgram: '" + << build_options << "'."; cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); /* show warnings even if build is successful */ @@ -336,12 +339,13 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src) bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src) { - string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n"; + string source = "#include \"kernels/opencl/" + kernel_file + "\"\n"; /* We compile kernels consisting of many files. unfortunately OpenCL * kernel caches do not seem to recognize changes in included files. 
* so we force recompile on changes by adding the md5 hash of all files. */ source = path_source_replace_includes(source, path_get("kernel")); + source += "\n// " + util_md5_string(source) + "\n"; if(debug_src) { path_write_text(*debug_src, source); @@ -438,7 +442,11 @@ void OpenCLDeviceBase::OpenCLProgram::load() if(!program) { add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5(); + /* need to create source to get md5 */ + string source = "#include \"kernels/opencl/" + kernel_file + "\"\n"; + source = path_source_replace_includes(source, path_get("kernel")); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); basename = path_cache_get(path_join("kernels", basename)); string clbin = basename + ".clbin"; diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 29e0f44841e..1c740b5c6eb 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -13,8 +13,11 @@ set(INC_SYS set(SRC kernels/cpu/kernel.cpp + kernels/cpu/kernel_split.cpp kernels/opencl/kernel.cl + kernels/opencl/kernel_state_buffer_size.cl kernels/opencl/kernel_data_init.cl + kernels/opencl/kernel_path_init.cl kernels/opencl/kernel_queue_enqueue.cl kernels/opencl/kernel_scene_intersect.cl kernels/opencl/kernel_lamp_emission.cl @@ -24,8 +27,8 @@ set(SRC kernels/opencl/kernel_direct_lighting.cl kernels/opencl/kernel_shadow_blocked.cl kernels/opencl/kernel_next_iteration_setup.cl - kernels/opencl/kernel_sum_all_radiance.cl kernels/cuda/kernel.cu + kernels/cuda/kernel_split.cu ) set(SRC_BVH_HEADERS @@ -88,6 +91,10 @@ set(SRC_KERNELS_CPU_HEADERS kernels/cpu/kernel_cpu_image.h ) +set(SRC_KERNELS_CUDA_HEADERS + kernels/cuda/kernel_config.h +) + set(SRC_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -195,11 +202,14 @@ set(SRC_SPLIT_HEADERS 
split/kernel_holdout_emission_blurring_pathtermination_ao.h split/kernel_lamp_emission.h split/kernel_next_iteration_setup.h + split/kernel_path_init.h + split/kernel_queue_enqueue.h split/kernel_scene_intersect.h split/kernel_shader_eval.h split/kernel_shadow_blocked.h split/kernel_split_common.h - split/kernel_sum_all_radiance.h + split/kernel_split_data.h + split/kernel_split_data_types.h ) # CUDA module @@ -227,8 +237,9 @@ if(WITH_CYCLES_CUDA_BINARIES) endif() # build for each arch - set(cuda_sources kernels/cuda/kernel.cu + set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu ${SRC_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_SVM_HEADERS} ${SRC_GEOM_HEADERS} @@ -237,15 +248,22 @@ if(WITH_CYCLES_CUDA_BINARIES) ) set(cuda_cubins) - macro(CYCLES_CUDA_KERNEL_ADD arch experimental) - if(${experimental}) - set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__") - set(cuda_cubin kernel_experimental_${arch}.cubin) + macro(CYCLES_CUDA_KERNEL_ADD arch split experimental) + if(${split}) + set(cuda_extra_flags "-D__SPLIT__") + set(cuda_cubin kernel_split) else() set(cuda_extra_flags "") - set(cuda_cubin kernel_${arch}.cubin) + set(cuda_cubin kernel) + endif() + + if(${experimental}) + set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__) + set(cuda_cubin ${cuda_cubin}_experimental) endif() + set(cuda_cubin ${cuda_cubin}_${arch}.cubin) + if(WITH_CYCLES_DEBUG) set(cuda_debug_flags "-D__KERNEL_DEBUG__") else() @@ -258,13 +276,19 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}") set(cuda_math_flags "--use_fast_math") + if(split) + set(cuda_kernel_src "/kernels/cuda/kernel_split.cu") + else() + set(cuda_kernel_src "/kernels/cuda/kernel.cu") + endif() + add_custom_command( OUTPUT ${cuda_cubin} COMMAND ${cuda_nvcc_command} -arch=${arch} ${CUDA_NVCC_FLAGS} -m${CUDA_BITS} - --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu + --cubin 
${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src} -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} @@ -291,7 +315,12 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) # Compile regular kernel - CYCLES_CUDA_KERNEL_ADD(${arch} FALSE) + CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE) + + if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES) + # Compile split kernel + CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE) + endif() endforeach() add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins}) @@ -314,31 +343,42 @@ if(CXX_HAS_SSE) kernels/cpu/kernel_sse2.cpp kernels/cpu/kernel_sse3.cpp kernels/cpu/kernel_sse41.cpp + kernels/cpu/kernel_split_sse2.cpp + kernels/cpu/kernel_split_sse3.cpp + kernels/cpu/kernel_split_sse41.cpp ) set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX) list(APPEND SRC kernels/cpu/kernel_avx.cpp + kernels/cpu/kernel_split_avx.cpp ) set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}") endif() if(CXX_HAS_AVX2) list(APPEND SRC kernels/cpu/kernel_avx2.cpp + kernels/cpu/kernel_split_avx2.cpp ) set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS 
"${CYCLES_AVX2_KERNEL_FLAGS}") + set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}") endif() add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_KERNELS_CPU_HEADERS} + ${SRC_KERNELS_CUDA_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS} @@ -361,7 +401,9 @@ endif() #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) @@ -371,9 +413,10 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emiss delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) 
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm) diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h index b7abc1ec507..4894ea58dba 100644 --- a/intern/cycles/kernel/closure/alloc.h +++ b/intern/cycles/kernel/closure/alloc.h @@ -20,17 +20,17 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty { kernel_assert(size <= sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = ccl_fetch(sd, num_closure_extra); + int num_closure = sd->num_closure; + int num_closure_extra = sd->num_closure_extra; if(num_closure + num_closure_extra >= MAX_CLOSURE) return NULL; - ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure]; + ShaderClosure *sc = &sd->closure[num_closure]; sc->type = type; sc->weight = weight; - ccl_fetch(sd, num_closure)++; + sd->num_closure++; return sc; } @@ -44,18 +44,18 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size) * This lets us keep the same fast array iteration over closures, as we * found linked list iteration and iteration with skipping to be slower. 
*/ int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure)); - int num_closure = ccl_fetch(sd, num_closure); - int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra; + int num_closure = sd->num_closure; + int num_closure_extra = sd->num_closure_extra + num_extra; if(num_closure + num_closure_extra > MAX_CLOSURE) { /* Remove previous closure. */ - ccl_fetch(sd, num_closure)--; - ccl_fetch(sd, num_closure_extra)++; + sd->num_closure--; + sd->num_closure_extra++; return NULL; } - ccl_fetch(sd, num_closure_extra) = num_closure_extra; - return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra); + sd->num_closure_extra = num_closure_extra; + return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra); } ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight) diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h index 7e4d5fe2e37..a44b9e2d9b9 100644 --- a/intern/cycles/kernel/closure/bsdf.h +++ b/intern/cycles/kernel/closure/bsdf.h @@ -51,89 +51,89 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg, switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, 
ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, 
dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, - eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state)); + label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, + eval, omega_in, &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - 
label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, + label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); + label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf); break; #endif 
default: @@ -157,75 +157,75 @@ float3 bsdf_eval(KernelGlobals *kg, { float3 eval; - if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) { + if(dot(sd->Ng, omega_in) >= 0.0f) { switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf); break; #ifdef __OSL__ case CLOSURE_BSDF_PHONG_RAMP_ID: - eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_RAMP_ID: - eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf); break; #endif case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf); break; case 
CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = 
bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: @@ -237,63 +237,63 @@ float3 bsdf_eval(KernelGlobals *kg, switch(sc->type) { case CLOSURE_BSDF_DIFFUSE_ID: case CLOSURE_BSDF_BSSRDF_ID: - eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf); break; #ifdef __SVM__ case CLOSURE_BSDF_OREN_NAYAR_ID: - eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSLUCENT_ID: - eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFLECTION_ID: - eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_REFRACTION_ID: - eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_TRANSPARENT_ID: - eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_GGX_ID: case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID: case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID: - eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID: - eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = 
bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID: - eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state)); + eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state); break; case CLOSURE_BSDF_MICROFACET_BECKMANN_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID: case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID: - eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID: case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID: - eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: - eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_DIFFUSE_TOON_ID: - eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_GLOSSY_TOON_ID: - eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_REFLECTION_ID: - eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf); break; case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: - eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf); break; #endif #ifdef __VOLUME__ case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: - eval = 
volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf); + eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf); break; #endif default: diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h index 08ccee56335..cc62192ef21 100644 --- a/intern/cycles/kernel/geom/geom_attribute.h +++ b/intern/cycles/kernel/geom/geom_attribute.h @@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData * ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { return ATTR_PRIM_CURVE; } else @@ -53,12 +53,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found() ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id) { - if(ccl_fetch(sd, object) == PRIM_NONE) { + if(sd->object == PRIM_NONE) { return attribute_not_found(); } /* for SVM, find attribute by unique id */ - uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride; + uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride; attr_offset += attribute_primitive_type(kg, sd); uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); @@ -73,7 +73,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh AttributeDescriptor desc; desc.element = (AttributeElement)attr_map.y; - if(ccl_fetch(sd, prim) == PRIM_NONE && + if(sd->prim == PRIM_NONE && desc.element != ATTR_ELEMENT_MESH && desc.element != ATTR_ELEMENT_VOXEL && desc.element != ATTR_ELEMENT_OBJECT) diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h index 9de335403ce..7cc840ce78d 100644 --- a/intern/cycles/kernel/geom/geom_curve.h +++ b/intern/cycles/kernel/geom/geom_curve.h @@ -32,22 +32,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const 
ShaderData *sd, if(dy) *dy = 0.0f; #endif - return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 - f0); if(dy) *dy = 0.0f; #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -71,22 +71,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = 
ccl_fetch(sd, du).dx*(f1 - f0); + if(dx) *dx = sd->du.dx*(f1 - f0); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); #endif - return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1; + return (1.0f - sd->u)*f0 + sd->u*f1; } else { #ifdef __RAY_DIFFERENTIALS__ @@ -104,22 +104,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) { float r = 0.0f; - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + if(sd->type & PRIMITIVE_ALL_CURVE) { + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); } - r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w; + r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w; } return r*2.0f; @@ -130,8 +130,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd) ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd) { - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); - int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); + int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float4 P_curve[2]; @@ -139,7 +139,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); 
- return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u)); + return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u); } /* Curve tangent normal */ @@ -148,14 +148,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd) { float3 tgN = make_float3(0.0f,0.0f,0.0f); - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { - tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu)))); + tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu))); tgN = normalize(tgN); /* need to find suitable scaled gd for corrected normal */ #if 0 - tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu)); + tgN = normalize(tgN - gd * sd->dPdu); #endif } @@ -229,6 +229,15 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax) #endif { + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + int segment = PRIMITIVE_UNPACK_SEGMENT(type); float epsilon = 0.0f; float r_st, r_en; @@ -257,7 +266,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte #ifdef __KERNEL_AVX2__ avxf P_curve_0_1, P_curve_2_3; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x); P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x); } @@ -268,7 +277,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte #else /* __KERNEL_AVX2__ */ 
ssef P_curve[4]; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve[0] = load4f(&kg->__curve_keys.data[ka].x); P_curve[1] = load4f(&kg->__curve_keys.data[k0].x); P_curve[2] = load4f(&kg->__curve_keys.data[k1].x); @@ -363,7 +372,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte float4 P_curve[4]; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve[0] = kernel_tex_fetch(__curve_keys, ka); P_curve[1] = kernel_tex_fetch(__curve_keys, k0); P_curve[2] = kernel_tex_fetch(__curve_keys, k1); @@ -689,6 +698,15 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection # define dot3(x, y) dot(x, y) #endif + const bool is_curve_primitive = (type & PRIMITIVE_CURVE); + + if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) { + const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr); + if(time < prim_time.x || time > prim_time.y) { + return false; + } + } + int segment = PRIMITIVE_UNPACK_SEGMENT(type); /* curve Intersection check */ int flags = kernel_data.curve.curveflags; @@ -703,7 +721,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection #ifndef __KERNEL_SSE2__ float4 P_curve[2]; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve[0] = kernel_tex_fetch(__curve_keys, k0); P_curve[1] = kernel_tex_fetch(__curve_keys, k1); } @@ -738,7 +756,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection #else ssef P_curve[2]; - if(type & PRIMITIVE_CURVE) { + if(is_curve_primitive) { P_curve[0] = load4f(&kg->__curve_keys.data[k0].x); P_curve[1] = load4f(&kg->__curve_keys.data[k1].x); } @@ -948,7 +966,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); #endif @@ 
-961,7 +979,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con int prim = kernel_tex_fetch(__prim_index, isect->prim); float4 v00 = kernel_tex_fetch(__curves, prim); - int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type)); + int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type); int k1 = k0 + 1; float3 tg; @@ -972,14 +990,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float4 P_curve[4]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0] = kernel_tex_fetch(__curve_keys, ka); P_curve[1] = kernel_tex_fetch(__curve_keys, k0); P_curve[2] = kernel_tex_fetch(__curve_keys, k1); P_curve[3] = kernel_tex_fetch(__curve_keys, kb); } else { - motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve); + motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve); } float3 p[4]; @@ -991,43 +1009,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con P = P + D*t; #ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = 0.0f; + sd->u = isect->u; + sd->v = 0.0f; #endif tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3])); if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) { - ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D)))); + sd->Ng = normalize(-(D - tg * (dot(tg, D)))); } else { /* direction from inside to surface of curve */ float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]); - ccl_fetch(sd, Ng) = normalize(P - p_curr); + sd->Ng = normalize(P - p_curr); /* adjustment for changing radius */ float gd = isect->v; if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); } } /* todo: sometimes the normal is still so that this is detected as * backfacing 
even if cull backfaces is enabled */ - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); + sd->N = sd->Ng; } else { float4 P_curve[2]; - if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) { + if(sd->type & PRIMITIVE_CURVE) { P_curve[0]= kernel_tex_fetch(__curve_keys, k0); P_curve[1]= kernel_tex_fetch(__curve_keys, k1); } else { - motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve); + motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve); } float l = 1.0f; @@ -1038,39 +1056,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con float3 dif = P - float4_to_float3(P_curve[0]); #ifdef __UV__ - ccl_fetch(sd, u) = dot(dif,tg)/l; - ccl_fetch(sd, v) = 0.0f; + sd->u = dot(dif,tg)/l; + sd->v = 0.0f; #endif if(flag & CURVE_KN_TRUETANGENTGNORMAL) { - ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D)); - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); + sd->Ng = -(D - tg * dot(tg, D)); + sd->Ng = normalize(sd->Ng); } else { float gd = isect->v; /* direction from inside to surface of curve */ - ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd); + sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd); /* adjustment for changing radius */ if(gd != 0.0f) { - ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg; - ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng)); + sd->Ng = sd->Ng - gd * tg; + sd->Ng = normalize(sd->Ng); } } - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); + sd->N = sd->Ng; } #ifdef __DPDU__ /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = tg; - ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng)); + sd->dPdu = tg; + sd->dPdv = cross(tg, sd->Ng); #endif if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; #else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); #endif diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h 
b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h index d57d74ea882..2500228281e 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h @@ -48,7 +48,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, return P; } # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -77,7 +77,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -116,7 +116,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, # ifdef __INTERSECTION_REFINE__ if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, @@ -144,7 +144,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h index 0e024a05db6..cb456056e20 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h @@ -39,26 +39,26 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, bool subsurface) { /* Get shader. */ - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* Get motion info. 
*/ /* TODO(sergey): This logic is really similar to motion_triangle_vertices(), * can we de-duplicate something here? */ int numsteps, numverts; - object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL); + object_motion_info(kg, sd->object, &numsteps, &numverts, NULL); /* Figure out which steps we need to fetch and their interpolation factor. */ int maxstep = numsteps*2; - int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1); - float t = ccl_fetch(sd, time)*maxstep - step; + int step = min((int)(sd->time*maxstep), maxstep-1); + float t = sd->time*maxstep - step; /* Find attribute. */ AttributeElement elem; - int offset = find_attribute_motion(kg, ccl_fetch(sd, object), + int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); /* Fetch vertex coordinates. */ float3 verts[3], next_verts[3]; - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts); motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts); /* Interpolate between steps. */ @@ -68,7 +68,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, /* Compute refined position. */ #ifdef __SUBSURFACE__ if(subsurface) { - ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, + sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, @@ -77,29 +77,29 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, else #endif /* __SUBSURFACE__*/ { - ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts); + sd->P = motion_triangle_refine(kg, sd, isect, ray, verts); } /* Compute face normal. 
*/ float3 Ng; - if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0])); } else { Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0])); } - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, N) = Ng; + sd->Ng = Ng; + sd->N = Ng; /* Compute derivatives of P w.r.t. uv. */ #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = (verts[0] - verts[2]); - ccl_fetch(sd, dPdv) = (verts[1] - verts[2]); + sd->dPdu = (verts[0] - verts[2]); + sd->dPdv = (verts[1] - verts[2]); #endif /* Compute smooth normal. */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + if(sd->shader & SHADER_SMOOTH_NORMAL) { /* Find attribute. */ AttributeElement elem; int offset = find_attribute_motion(kg, - ccl_fetch(sd, object), + sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem); kernel_assert(offset != ATTR_STD_NOT_FOUND); @@ -112,10 +112,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, normals[1] = (1.0f - t)*normals[1] + t*next_normals[1]; normals[2] = (1.0f - t)*normals[2] + t*next_normals[2]; /* Interpolate between vertices. 
*/ - float u = ccl_fetch(sd, u); - float v = ccl_fetch(sd, v); + float u = sd->u; + float v = sd->v; float w = 1.0f - u - v; - ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]); + sd->N = (u*normals[0] + v*normals[1] + w*normals[2]); } } diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h index f51b2d18657..5a04be8b0bf 100644 --- a/intern/cycles/kernel/geom/geom_object.h +++ b/intern/cycles/kernel/geom/geom_object.h @@ -137,9 +137,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P); + *P = transform_point_auto(&sd->ob_tfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -149,9 +149,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P) { #ifdef __OBJECT_MOTION__ - *P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P); + *P = transform_point_auto(&sd->ob_itfm, *P); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *P = transform_point(&tfm, *P); #endif } @@ -161,12 +161,12 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) { - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N)); + 
if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) { + *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N)); } #else - if(ccl_fetch(sd, object) != OBJECT_NONE) { - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + if(sd->object != OBJECT_NONE) { + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); } #endif @@ -177,9 +177,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N) { #ifdef __OBJECT_MOTION__ - *N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N)); + *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N)); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *N = normalize(transform_direction_transposed(&tfm, *N)); #endif } @@ -189,9 +189,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D); + *D = transform_direction_auto(&sd->ob_tfm, *D); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -201,9 +201,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D) { #ifdef __OBJECT_MOTION__ - *D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D); + *D = transform_direction_auto(&sd->ob_itfm, *D); #else - 
Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); *D = transform_direction(&tfm, *D); #endif } @@ -212,13 +212,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd) { - if(ccl_fetch(sd, object) == OBJECT_NONE) + if(sd->object == OBJECT_NONE) return make_float3(0.0f, 0.0f, 0.0f); #ifdef __OBJECT_MOTION__ - return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w); + return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w); #else - Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); + Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); return make_float3(tfm.x.w, tfm.y.w, tfm.z.w); #endif } @@ -326,7 +326,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object) ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd) { - return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1); + return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE + 1); } /* Particle data from which object was instanced */ diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h index 6a0ff5a4a04..5663b598508 100644 --- a/intern/cycles/kernel/geom/geom_patch.h +++ b/intern/cycles/kernel/geom/geom_patch.h @@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float val = 0.0f; 
@@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); @@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int float weights_du[PATCH_MAX_CONTROL_VERTS]; float weights_dv[PATCH_MAX_CONTROL_VERTS]; - int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel, + int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel, indices, weights, weights_du, weights_dv); float3 val = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h index 8a73bb2f78b..90a9c2147cc 100644 --- a/intern/cycles/kernel/geom/geom_primitive.h +++ b/intern/cycles/kernel/geom/geom_primitive.h @@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg, const AttributeDescriptor desc, float *dx, float *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { + else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { return volume_attribute_float(kg, sd, desc, dx, dy); } #endif @@ -56,19 +56,19 @@ ccl_device_inline 
float3 primitive_attribute_float3(KernelGlobals *kg, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) { + if(sd->type & PRIMITIVE_ALL_TRIANGLE) { if(subd_triangle_patch(kg, sd) == ~0) return triangle_attribute_float3(kg, sd, desc, dx, dy); else return subd_triangle_attribute_float3(kg, sd, desc, dx, dy); } #ifdef __HAIR__ - else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + else if(sd->type & PRIMITIVE_ALL_CURVE) { return curve_attribute_float3(kg, sd, desc, dx, dy); } #endif #ifdef __VOLUME__ - else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { + else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) { return volume_attribute_float3(kg, sd, desc, dx, dy); } #endif @@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) { #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) + if(sd->type & PRIMITIVE_ALL_CURVE) # ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); # else return make_float3(0.0f, 0.0f, 0.0f); # endif @@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd) float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL); data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f); object_normal_transform(kg, sd, &data); - return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N)))); + return cross(sd->N, normalize(cross(data, sd->N))); } else { /* otherwise use surface derivatives */ #ifdef __DPDU__ - return normalize(ccl_fetch(sd, dPdu)); + return normalize(sd->dPdu); #else return make_float3(0.0f, 0.0f, 0.0f); #endif @@ -153,17 +153,17 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * float3 center; #ifdef __HAIR__ - bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE; + bool 
is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE; if(is_curve_primitive) { center = curve_motion_center_location(kg, sd); - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, ¢er); } } else #endif - center = ccl_fetch(sd, P); + center = sd->P; float3 motion_pre = center, motion_post = center; @@ -173,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * if(desc.offset != ATTR_STD_NOT_FOUND) { /* get motion info */ int numverts, numkeys; - object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys); + object_motion_info(kg, sd->object, NULL, &numverts, &numkeys); /* lookup attributes */ motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL); - desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys; + desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys; motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL); #ifdef __HAIR__ - if(is_curve_primitive && (ccl_fetch(sd, object_flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { + if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) { object_position_transform(kg, sd, &motion_pre); object_position_transform(kg, sd, &motion_post); } @@ -193,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData * * transformation was set match the world/object space of motion_pre/post */ Transform tfm; - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE); + tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE); motion_pre = transform_point(&tfm, motion_pre); - tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST); + tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST); motion_post = transform_point(&tfm, motion_post); float3 
motion_center; diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h index 647840dc696..044e82f03d4 100644 --- a/intern/cycles/kernel/geom/geom_subd_triangle.h +++ b/intern/cycles/kernel/geom/geom_subd_triangle.h @@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd) { - return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0; + return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0; } /* UV coords of triangle within patch */ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3]) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x); uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y); @@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float a, dads, dadt; a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt); @@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float dtdv = dpdv.y; if(dx) { - float dudx = ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + float dudx = sd->du.dx; + float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + 
dtdv*dvdy; @@ -174,11 +174,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER) { float2 uv[3]; @@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = 0.0f; @@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float2 dpdv = uv[1] - uv[2]; /* p is [s, t] */ - float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2]; + float2 p = dpdu * sd->u + dpdv * sd->v + uv[2]; float3 a, dads, dadt; @@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float dtdv = dpdv.y; if(dx) { - float dudx = 
ccl_fetch(sd, du).dx; - float dvdx = ccl_fetch(sd, dv).dx; + float dudx = sd->du.dx; + float dvdx = sd->dv.dx; float dsdx = dsdu*dudx + dsdv*dvdx; float dtdx = dtdu*dudx + dtdv*dvdx; @@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con *dx = dads*dsdx + dadt*dtdx; } if(dy) { - float dudy = ccl_fetch(sd, du).dy; - float dvdy = ccl_fetch(sd, dv).dy; + float dudy = sd->du.dy; + float dvdy = sd->dv.dy; float dsdy = dsdu*dudy + dsdv*dvdy; float dtdy = dtdu*dudy + dtdv*dvdy; @@ -299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { float2 uv[3]; @@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c; - if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c; + if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c; + if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c; #endif - return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - 
ccl_fetch(sd, u) - ccl_fetch(sd, v))*c; + return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h index 3229091bbb0..47778553b94 100644 --- a/intern/cycles/kernel/geom/geom_triangle.h +++ b/intern/cycles/kernel/geom/geom_triangle.h @@ -26,13 +26,13 @@ CCL_NAMESPACE_BEGIN ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd) { /* load triangle vertices */ - const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0)); const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1)); const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2)); /* return normal */ - if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { + if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) { return normalize(cross(v2 - v0, v1 - v0)); } else { @@ -110,34 +110,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s if(dx) *dx = 0.0f; if(dy) *dy = 0.0f; - return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim)); + return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim); float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x); float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y); float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - 
(ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER) { - int tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float f0 = kernel_tex_fetch(__attributes_float, tri + 0); float f1 = kernel_tex_fetch(__attributes_float, tri + 1); float f2 = kernel_tex_fetch(__attributes_float, tri + 2); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = 0.0f; @@ -153,24 +153,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f); - return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim))); + return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim)); } else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) { - uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)); + uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, 
sd->prim); float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x)); float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y)); float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z)); #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) { - int tri = desc.offset + ccl_fetch(sd, prim)*3; + int tri = desc.offset + sd->prim*3; float3 f0, f1, f2; if(desc.element == ATTR_ELEMENT_CORNER) { @@ -185,11 +185,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData } #ifdef __RAY_DIFFERENTIALS__ - if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2; - if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2; + if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2; + if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2; #endif - return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2; + return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2; } else { if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f); diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h index 
4db121d94f4..4d234dd62bd 100644 --- a/intern/cycles/kernel/geom/geom_triangle_intersect.h +++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h @@ -457,7 +457,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, return P; } # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM); # endif @@ -491,7 +491,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { # ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; # else Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM); # endif @@ -519,7 +519,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_itfm); + Transform tfm = sd->ob_itfm; #else Transform tfm = object_fetch_transform(kg, isect->object, @@ -557,7 +557,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg, if(isect->object != OBJECT_NONE) { #ifdef __OBJECT_MOTION__ - Transform tfm = ccl_fetch(sd, ob_tfm); + Transform tfm = sd->ob_tfm; #else Transform tfm = object_fetch_transform(kg, isect->object, diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h index 03724c955be..1e0ef5201c9 100644 --- a/intern/cycles/kernel/geom/geom_volume.h +++ b/intern/cycles/kernel/geom/geom_volume.h @@ -64,7 +64,7 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg, ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy) { - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); + float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_CUDA__ # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, 
desc.offset); @@ -91,7 +91,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy) { - float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P)); + float3 P = volume_normalized_position(kg, sd, sd->P); #ifdef __KERNEL_CUDA__ # if __CUDA_ARCH__ >= 300 CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset); diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h index 9279a94c13a..cd339e6237e 100644 --- a/intern/cycles/kernel/kernel.h +++ b/intern/cycles/kernel/kernel.h @@ -20,6 +20,7 @@ /* CPU Kernel Interface */ #include "util_types.h" +#include "kernel_types.h" CCL_NAMESPACE_BEGIN @@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) struct KernelGlobals; +struct KernelData; KernelGlobals *kernel_globals_create(); void kernel_globals_free(KernelGlobals *kg); diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 5bcc57cdcdf..f18d145f7cf 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -54,7 +54,8 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF); shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN); - /* TODO, disable the closures we won't need */ + /* TODO, disable more closures we don't need besides transparent */ + shader_bsdf_disable_transparency(kg, sd); #ifdef __BRANCHED_PATH__ if(!kernel_data.integrator.branched) { diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h index dedac6b1465..0df5217d97a 100644 --- a/intern/cycles/kernel/kernel_camera.h +++ b/intern/cycles/kernel/kernel_camera.h @@ -457,7 +457,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, 
ShaderData *sd, { if(kernel_data.cam.type != CAMERA_PANORAMA) { /* perspective / ortho */ - if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) + if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE) P += camera_position(kg); Transform tfm = kernel_data.cam.worldtondc; @@ -467,7 +467,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, /* panorama */ Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) P = normalize(transform_point(&tfm, P)); else P = normalize(transform_direction(&tfm, P)); diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h index 9d1f3bdc918..e347a1eca18 100644 --- a/intern/cycles/kernel/kernel_compat_cpu.h +++ b/intern/cycles/kernel/kernel_compat_cpu.h @@ -44,6 +44,15 @@ #define ccl_addr_space +#define ccl_local_id(d) 0 +#define ccl_global_id(d) (kg->global_id[d]) + +#define ccl_local_size(d) 1 +#define ccl_global_size(d) (kg->global_size[d]) + +#define ccl_group_id(d) ccl_global_id(d) +#define ccl_num_groups(d) ccl_global_size(d) + /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. 
*/ diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h index e0c7b17c6a0..37a9e8d2f84 100644 --- a/intern/cycles/kernel/kernel_compat_cuda.h +++ b/intern/cycles/kernel/kernel_compat_cuda.h @@ -46,6 +46,9 @@ #define ccl_device_noinline __device__ __noinline__ #define ccl_global #define ccl_constant +#define ccl_local __shared__ +#define ccl_local_param +#define ccl_private #define ccl_may_alias #define ccl_addr_space #define ccl_restrict __restrict__ @@ -60,6 +63,52 @@ #include "util_half.h" #include "util_types.h" +/* Work item functions */ + +ccl_device_inline uint ccl_local_id(uint d) +{ + switch(d) { + case 0: return threadIdx.x; + case 1: return threadIdx.y; + case 2: return threadIdx.z; + default: return 0; + } +} + +#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d)) + +ccl_device_inline uint ccl_local_size(uint d) +{ + switch(d) { + case 0: return blockDim.x; + case 1: return blockDim.y; + case 2: return blockDim.z; + default: return 0; + } +} + +#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d)) + +ccl_device_inline uint ccl_group_id(uint d) +{ + switch(d) { + case 0: return blockIdx.x; + case 1: return blockIdx.y; + case 2: return blockIdx.z; + default: return 0; + } +} + +ccl_device_inline uint ccl_num_groups(uint d) +{ + switch(d) { + case 0: return gridDim.x; + case 1: return gridDim.y; + case 2: return gridDim.z; + default: return 0; + } +} + /* Textures */ typedef texture<float4, 1> texture_float4; diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h index f076e3a7d37..6c963dea4f5 100644 --- a/intern/cycles/kernel/kernel_compat_opencl.h +++ b/intern/cycles/kernel/kernel_compat_opencl.h @@ -39,6 +39,7 @@ #define ccl_constant __constant #define ccl_global __global #define ccl_local __local +#define ccl_local_param __local #define ccl_private __private #define ccl_restrict restrict #define ccl_align(n) 
__attribute__((aligned(n))) @@ -49,6 +50,15 @@ # define ccl_addr_space #endif +#define ccl_local_id(d) get_local_id(d) +#define ccl_global_id(d) get_global_id(d) + +#define ccl_local_size(d) get_local_size(d) +#define ccl_global_size(d) get_global_size(d) + +#define ccl_group_id(d) get_group_id(d) +#define ccl_num_groups(d) get_num_groups(d) + /* Selective nodes compilation. */ #ifndef __NODES_MAX_GROUP__ # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 8c7c651a053..bc2d9604122 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -67,7 +67,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, false, ls->lamp); - ls->Ng = ccl_fetch(emission_sd, Ng); + ls->Ng = emission_sd->Ng; /* no path flag, we're evaluating this for all closures. that's weak but * we'd have to do multiple evaluations otherwise */ @@ -76,7 +76,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, path_state_modify_bounce(state, false); /* evaluate emissive closure */ - if(ccl_fetch(emission_sd, flag) & SD_EMISSION) + if(emission_sd->flag & SD_EMISSION) eval = shader_emissive_eval(kg, emission_sd); else eval = make_float3(0.0f, 0.0f, 0.0f); @@ -112,7 +112,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, -ls->D, dD, ls->t, - ccl_fetch(sd, time)); + sd->time); if(is_zero(light_eval)) return false; @@ -120,7 +120,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, /* evaluate BSDF at shading point */ #ifdef __VOLUME__ - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS); else { float bsdf_pdf; @@ -168,8 +168,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, if(ls->shader & SHADER_CAST_SHADOW) { /* setup ray */ - bool transmit = 
(dot(ccl_fetch(sd, Ng), ls->D) < 0.0f); - ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + bool transmit = (dot(sd->Ng, ls->D) < 0.0f); + ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng); if(ls->t == FLT_MAX) { /* distant light */ @@ -182,7 +182,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ray->D = normalize_len(ray->D, &ray->t); } - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = differential3_zero(); } else { @@ -204,14 +204,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader float3 L = shader_emissive_eval(kg, sd); #ifdef __HAIR__ - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE)) #else - if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS)) + if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS)) #endif { /* multiple importance sampling, get triangle light pdf, * and compute weight with respect to BSDF pdf */ - float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t); + float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t); float mis_weight = power_heuristic(bsdf_pdf, pdf); return L*mis_weight; diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h index 2b52a2d2f48..1c3884890bf 100644 --- a/intern/cycles/kernel/kernel_globals.h +++ b/intern/cycles/kernel/kernel_globals.h @@ -16,6 +16,9 @@ /* Constant Globals */ +#ifndef __KERNEL_GLOBALS_H__ +#define __KERNEL_GLOBALS_H__ + CCL_NAMESPACE_BEGIN /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in @@ -64,6 +67,13 @@ typedef struct KernelGlobals { /* Storage for decoupled volume steps. 
*/ VolumeStep *decoupled_volume_steps[2]; int decoupled_volume_steps_index; + + /* split kernel */ + SplitData split_data; + SplitParams split_param_data; + + int2 global_size; + int2 global_id; } KernelGlobals; #endif /* __KERNEL_CPU__ */ @@ -103,8 +113,8 @@ typedef ccl_addr_space struct KernelGlobals { # include "kernel_textures.h" # ifdef __SPLIT_KERNEL__ - ShaderData *sd_input; - Intersection *isect_shadow; + SplitData split_data; + SplitParams split_param_data; # endif } KernelGlobals; @@ -146,3 +156,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o CCL_NAMESPACE_END +#endif /* __KERNEL_GLOBALS_H__ */ diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h index 7aec47e4957..ed523696571 100644 --- a/intern/cycles/kernel/kernel_passes.h +++ b/intern/cycles/kernel/kernel_passes.h @@ -19,16 +19,16 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value) { ccl_global float *buf = buffer; -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#if defined(__SPLIT_KERNEL__) atomic_add_and_fetch_float(buf, value); #else *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ +#endif /* __SPLIT_KERNEL__ */ } ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#if defined(__SPLIT_KERNEL__) ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -39,12 +39,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa #else ccl_global float3 *buf = (ccl_global float3*)buffer; *buf = (sample == 0)? 
value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ +#endif /* __SPLIT_KERNEL__ */ } ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value) { -#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__) +#if defined(__SPLIT_KERNEL__) ccl_global float *buf_x = buffer + 0; ccl_global float *buf_y = buffer + 1; ccl_global float *buf_z = buffer + 2; @@ -57,7 +57,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa #else ccl_global float4 *buf = (ccl_global float4*)buffer; *buf = (sample == 0)? value: *buf + value; -#endif // __SPLIT_KERNEL__ && __WORK_STEALING__ +#endif /* __SPLIT_KERNEL__ */ } ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L, @@ -75,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl return; if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) { - if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) || + if(!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f || average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) { if(sample == 0) { if(flag & PASS_DEPTH) { - float depth = camera_distance(kg, ccl_fetch(sd, P)); + float depth = camera_distance(kg, sd->P); kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth); } if(flag & PASS_OBJECT_ID) { - float id = object_pass_id(kg, ccl_fetch(sd, object)); + float id = object_pass_id(kg, sd->object); kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id); } if(flag & PASS_MATERIAL_ID) { @@ -96,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl } if(flag & PASS_NORMAL) { - float3 normal = ccl_fetch(sd, N); + float3 normal = sd->N; kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal); } if(flag & PASS_UV) { @@ -127,7 +127,7 @@ ccl_device_inline void 
kernel_write_data_passes(KernelGlobals *kg, ccl_global fl float mist_start = kernel_data.film.mist_start; float mist_inv_depth = kernel_data.film.mist_inv_depth; - float depth = camera_distance(kg, ccl_fetch(sd, P)); + float depth = camera_distance(kg, sd->P); float mist = saturate((depth - mist_start)*mist_inv_depth); /* falloff */ diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index f90701a8260..95c27850513 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -75,17 +75,17 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg, sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) { @@ -459,7 +459,7 @@ bool kernel_path_subsurface_scatter( # ifdef __VOLUME__ ss_indirect->need_update_volume_stack = kernel_data.integrator.use_volumes && - ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME; + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; # endif /* __VOLUME__ */ /* compute lighting with the BSDF closure */ diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index ff2b828795d..d58960cae4e 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -42,17 +42,17 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg, sample_cos_hemisphere(ao_N, bsdf_u, 
bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray light_ray; float3 ao_shadow; - light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + light_ray.P = ray_offset(sd->P, sd->Ng); light_ray.D = ao_D; light_ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif /* __OBJECT_MOTION__ */ - light_ray.dP = ccl_fetch(sd, dP); + light_ray.dP = sd->dP; light_ray.dD = differential3_zero(); if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) @@ -67,8 +67,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd, float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(!CLOSURE_IS_BSDF(sc->type)) continue; @@ -140,8 +140,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, Ray *ray, float3 throughput) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = &ccl_fetch(sd, closure)[i]; + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(!CLOSURE_IS_BSSRDF(sc->type)) continue; @@ -169,7 +169,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg, Ray volume_ray = *ray; bool need_update_volume_stack = kernel_data.integrator.use_volumes && - ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME; + sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME; #endif /* __VOLUME__ */ /* compute lighting with the BSDF closure */ diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h index fea503d06e5..34a78552c1d 
100644 --- a/intern/cycles/kernel/kernel_path_surface.h +++ b/intern/cycles/kernel/kernel_path_surface.h @@ -25,7 +25,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal { #ifdef __EMISSION__ /* sample illumination from lights to find path contribution */ - if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)) + if(!(sd->flag & SD_BSDF_HAS_EVAL)) return; Ray light_ray; @@ -33,7 +33,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal bool is_lamp; # ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; # endif if(sample_all_lights) { @@ -52,7 +52,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples); LightSample ls; - if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) { + if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) { /* The sampling probability returned by lamp_light_sample assumes that all lights were sampled. * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */ if(kernel_data.integrator.pdf_triangles != 0.0f) @@ -87,7 +87,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal light_t = 0.5f*light_t; LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. 
*/ if(kernel_data.integrator.num_all_lights) ls.pdf *= 2.0f; @@ -113,7 +113,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal float terminate = path_state_rng_light_termination(kg, rng, state); LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) { /* sample random light */ if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ @@ -156,15 +156,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif #ifdef __OBJECT_MOTION__ - ray->time = ccl_fetch(sd, time); + ray->time = sd->time; #endif #ifdef __VOLUME__ @@ -195,7 +195,7 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_ PathRadiance *L) { #ifdef __EMISSION__ - if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) + if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) return; /* sample illumination from lights to find path contribution */ @@ -208,11 +208,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_ bool is_lamp; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif LightSample ls; - if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) { + if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, 
state->bounce, &ls)) { float terminate = path_state_rng_light_termination(kg, rng, state); if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* trace shadow ray */ @@ -238,7 +238,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, ccl_addr_space Ray *ray) { /* no BSDF? we can stop here */ - if(ccl_fetch(sd, flag) & SD_BSDF) { + if(sd->flag & SD_BSDF) { /* sample BSDF */ float bsdf_pdf; BsdfEval bsdf_eval; @@ -270,16 +270,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, path_state_next(kg, state, label); /* setup ray */ - ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng); ray->D = normalize(bsdf_omega_in); if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; ray->dD = bsdf_domega_in; #endif @@ -291,21 +291,21 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg, return true; } #ifdef __VOLUME__ - else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) { + else if(sd->flag & SD_HAS_ONLY_VOLUME) { /* no surface shader but have a volume shader? 
act transparent */ /* update path state, count as transparent */ path_state_next(kg, state, LABEL_TRANSPARENT); if(state->bounce == 0) - ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */ + ray->t -= sd->ray_length; /* clipping works through transparent */ else ray->t = FLT_MAX; /* setup ray position, direction stays unchanged */ - ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng)); + ray->P = ray_offset(sd->P, -sd->Ng); #ifdef __RAY_DIFFERENTIALS__ - ray->dP = ccl_fetch(sd, dP); + ray->dP = sd->dP; #endif /* enter/exit volume */ diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h index cf5614b8a86..2e63909a38c 100644 --- a/intern/cycles/kernel/kernel_queues.h +++ b/intern/cycles/kernel/kernel_queues.h @@ -17,6 +17,8 @@ #ifndef __KERNEL_QUEUE_H__ #define __KERNEL_QUEUE_H__ +CCL_NAMESPACE_BEGIN + /* * Queue utility functions for split kernel */ @@ -35,7 +37,8 @@ ccl_device void enqueue_ray_index( ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */ { /* This thread's queue index. */ - int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size); + int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number]) + + (queue_number * queue_size); queues[my_queue_index] = ray_index; } @@ -47,6 +50,7 @@ ccl_device void enqueue_ray_index( * is no more ray to allocate to other threads. */ ccl_device int get_ray_index( + KernelGlobals *kg, int thread_index, /* Global thread index. */ int queue_number, /* Queue to operate on. */ ccl_global int *queues, /* Buffer of all queues. */ @@ -68,24 +72,25 @@ ccl_device void enqueue_ray_index_local( int queue_number, /* Queue in which to enqueue ray index. */ char enqueue_flag, /* True for threads whose ray index has to be enqueued. */ int queuesize, /* queue size. */ - ccl_local unsigned int *local_queue_atomics, /* To to local queue atomics. 
*/ + ccl_local_param unsigned int *local_queue_atomics, /* To to local queue atomics. */ ccl_global int *Queue_data, /* Queues. */ ccl_global int *Queue_index) /* To do global queue atomics. */ { - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); /* Get local queue id .*/ unsigned int lqidx; if(enqueue_flag) { - lqidx = atomic_inc(local_queue_atomics); + lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue offset. */ if(lidx == 0) { - *local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics); + *local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number], + *local_queue_atomics); } - barrier(CLK_LOCAL_MEM_FENCE); + ccl_barrier(CCL_LOCAL_MEM_FENCE); /* Get global queue index and enqueue ray. */ if(enqueue_flag) { @@ -96,19 +101,19 @@ ccl_device void enqueue_ray_index_local( ccl_device unsigned int get_local_queue_index( int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */ - ccl_local unsigned int *local_queue_atomics) + ccl_local_param unsigned int *local_queue_atomics) { - int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]); + int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]); return my_lqidx; } ccl_device unsigned int get_global_per_queue_offset( int queue_number, - ccl_local unsigned int *local_queue_atomics, + ccl_local_param unsigned int *local_queue_atomics, ccl_global int* global_queue_atomics) { - unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number], - local_queue_atomics[queue_number]); + unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number], + local_queue_atomics[queue_number]); return queue_offset; } @@ -116,10 +121,12 @@ ccl_device unsigned int get_global_queue_index( int 
queue_number, int queuesize, unsigned int lqidx, - ccl_local unsigned int * global_per_queue_offset) + ccl_local_param unsigned int * global_per_queue_offset) { int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number]; return my_gqidx; } +CCL_NAMESPACE_END + #endif // __KERNEL_QUEUE_H__ diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index d0826e5e879..a2ab96b35e2 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN #ifdef __OBJECT_MOTION__ ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time) { - if(ccl_fetch(sd, object_flag) & SD_OBJECT_MOTION) { - ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time); - ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm)); + if(sd->object_flag & SD_OBJECT_MOTION) { + sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time); + sd->ob_itfm = transform_quick_inverse(sd->ob_tfm); } else { - ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM); - ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM); + sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM); + sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM); } } #endif @@ -55,55 +55,55 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, const Ray *ray) { #ifdef __INSTANCING__ - ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object; + sd->object = (isect->object == PRIM_NONE)? 
kernel_tex_fetch(__prim_object, isect->prim): isect->object; #endif - ccl_fetch(sd, type) = isect->type; - ccl_fetch(sd, flag) = 0; - ccl_fetch(sd, object_flag) = kernel_tex_fetch(__object_flag, - ccl_fetch(sd, object)); + sd->type = isect->type; + sd->flag = 0; + sd->object_flag = kernel_tex_fetch(__object_flag, + sd->object); /* matrices and time */ #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, ray->time); - ccl_fetch(sd, time) = ray->time; + sd->time = ray->time; #endif - ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim); - ccl_fetch(sd, ray_length) = isect->t; + sd->prim = kernel_tex_fetch(__prim_index, isect->prim); + sd->ray_length = isect->t; #ifdef __UV__ - ccl_fetch(sd, u) = isect->u; - ccl_fetch(sd, v) = isect->v; + sd->u = isect->u; + sd->v = isect->v; #endif #ifdef __HAIR__ - if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->type & PRIMITIVE_ALL_CURVE) { /* curve */ - float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim)); + float4 curvedata = kernel_tex_fetch(__curves, sd->prim); - ccl_fetch(sd, shader) = __float_as_int(curvedata.z); - ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray); + sd->shader = __float_as_int(curvedata.z); + sd->P = bvh_curve_refine(kg, sd, isect, ray); } else #endif - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* static triangle */ float3 Ng = triangle_normal(kg, sd); - ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim)); + sd->shader = kernel_tex_fetch(__tri_shader, sd->prim); /* vectors */ - ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray); - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, N) = Ng; + sd->P = triangle_refine(kg, sd, isect, ray); + sd->Ng = Ng; + sd->N = Ng; /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) + sd->N = 
triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); #ifdef __DPDU__ /* dPdu/dPdv */ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); #endif } else { @@ -111,40 +111,40 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg, motion_triangle_shader_setup(kg, sd, isect, ray, false); } - ccl_fetch(sd, I) = -ray->D; + sd->I = -ray->D; - ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); + sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); #ifdef __INSTANCING__ if(isect->object != OBJECT_NONE) { /* instance transform */ - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); + object_normal_transform_auto(kg, sd, &sd->N); + object_normal_transform_auto(kg, sd, &sd->Ng); # ifdef __DPDU__ - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); # endif } #endif /* backfacing test */ - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; #endif } #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t); - differential_incoming(&ccl_fetch(sd, dI), ray->dD); - differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), 
ccl_fetch(sd, dP), ccl_fetch(sd, Ng)); + differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t); + differential_incoming(&sd->dI, ray->dD); + differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng); #endif } @@ -249,106 +249,106 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg, int lamp) { /* vectors */ - ccl_fetch(sd, P) = P; - ccl_fetch(sd, N) = Ng; - ccl_fetch(sd, Ng) = Ng; - ccl_fetch(sd, I) = I; - ccl_fetch(sd, shader) = shader; + sd->P = P; + sd->N = Ng; + sd->Ng = Ng; + sd->I = I; + sd->shader = shader; if(prim != PRIM_NONE) - ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE; + sd->type = PRIMITIVE_TRIANGLE; else if(lamp != LAMP_NONE) - ccl_fetch(sd, type) = PRIMITIVE_LAMP; + sd->type = PRIMITIVE_LAMP; else - ccl_fetch(sd, type) = PRIMITIVE_NONE; + sd->type = PRIMITIVE_NONE; /* primitive */ #ifdef __INSTANCING__ - ccl_fetch(sd, object) = object; + sd->object = object; #endif /* currently no access to bvh prim index for strand sd->prim*/ - ccl_fetch(sd, prim) = prim; + sd->prim = prim; #ifdef __UV__ - ccl_fetch(sd, u) = u; - ccl_fetch(sd, v) = v; + sd->u = u; + sd->v = v; #endif - ccl_fetch(sd, ray_length) = t; + sd->ray_length = t; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); - ccl_fetch(sd, object_flag) = 0; - if(ccl_fetch(sd, object) != OBJECT_NONE) { - ccl_fetch(sd, object_flag) |= kernel_tex_fetch(__object_flag, - ccl_fetch(sd, object)); + sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->object_flag = 0; + if(sd->object != OBJECT_NONE) { + sd->object_flag |= kernel_tex_fetch(__object_flag, + sd->object); #ifdef __OBJECT_MOTION__ shader_setup_object_transforms(kg, sd, time); - ccl_fetch(sd, time) = time; + sd->time = time; } else if(lamp != LAMP_NONE) { - ccl_fetch(sd, ob_tfm) = lamp_fetch_transform(kg, lamp, false); - ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true); + sd->ob_tfm = 
lamp_fetch_transform(kg, lamp, false); + sd->ob_itfm = lamp_fetch_transform(kg, lamp, true); #endif } /* transform into world space */ if(object_space) { - object_position_transform_auto(kg, sd, &ccl_fetch(sd, P)); - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng)); - ccl_fetch(sd, N) = ccl_fetch(sd, Ng); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I)); + object_position_transform_auto(kg, sd, &sd->P); + object_normal_transform_auto(kg, sd, &sd->Ng); + sd->N = sd->Ng; + object_dir_transform_auto(kg, sd, &sd->I); } - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { /* smooth normal */ - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { - ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v)); + if(sd->shader & SHADER_SMOOTH_NORMAL) { + sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v); #ifdef __INSTANCING__ - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { - object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N)); + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_normal_transform_auto(kg, sd, &sd->N); } #endif } /* dPdu/dPdv */ #ifdef __DPDU__ - triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv)); + triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv); # ifdef __INSTANCING__ - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu)); - object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv)); + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { + object_dir_transform_auto(kg, sd, &sd->dPdu); + object_dir_transform_auto(kg, sd, &sd->dPdv); } # endif #endif } else { #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif } /* backfacing test */ - if(ccl_fetch(sd, 
prim) != PRIM_NONE) { - bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f); + if(sd->prim != PRIM_NONE) { + bool backfacing = (dot(sd->Ng, sd->I) < 0.0f); if(backfacing) { - ccl_fetch(sd, flag) |= SD_BACKFACING; - ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng); - ccl_fetch(sd, N) = -ccl_fetch(sd, N); + sd->flag |= SD_BACKFACING; + sd->Ng = -sd->Ng; + sd->N = -sd->N; #ifdef __DPDU__ - ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu); - ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv); + sd->dPdu = -sd->dPdu; + sd->dPdv = -sd->dPdv; #endif } } #ifdef __RAY_DIFFERENTIALS__ /* no ray differentials here yet */ - ccl_fetch(sd, dP) = differential3_zero(); - ccl_fetch(sd, dI) = differential3_zero(); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = differential3_zero(); + sd->dI = differential3_zero(); + sd->du = differential_zero(); + sd->dv = differential_zero(); #endif } @@ -378,39 +378,39 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd, ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray) { /* vectors */ - ccl_fetch(sd, P) = ray->D; - ccl_fetch(sd, N) = -ray->D; - ccl_fetch(sd, Ng) = -ray->D; - ccl_fetch(sd, I) = -ray->D; - ccl_fetch(sd, shader) = kernel_data.background.surface_shader; - ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE); - ccl_fetch(sd, object_flag) = 0; + sd->P = ray->D; + sd->N = -ray->D; + sd->Ng = -ray->D; + sd->I = -ray->D; + sd->shader = kernel_data.background.surface_shader; + sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE); + sd->object_flag = 0; #ifdef __OBJECT_MOTION__ - ccl_fetch(sd, time) = ray->time; + sd->time = ray->time; #endif - ccl_fetch(sd, ray_length) = 0.0f; + sd->ray_length = 0.0f; #ifdef __INSTANCING__ - ccl_fetch(sd, object) = PRIM_NONE; + sd->object = PRIM_NONE; #endif - ccl_fetch(sd, prim) = PRIM_NONE; + sd->prim = 
PRIM_NONE; #ifdef __UV__ - ccl_fetch(sd, u) = 0.0f; - ccl_fetch(sd, v) = 0.0f; + sd->u = 0.0f; + sd->v = 0.0f; #endif #ifdef __DPDU__ /* dPdu/dPdv */ - ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f); - ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdu = make_float3(0.0f, 0.0f, 0.0f); + sd->dPdv = make_float3(0.0f, 0.0f, 0.0f); #endif #ifdef __RAY_DIFFERENTIALS__ /* differentials */ - ccl_fetch(sd, dP) = ray->dD; - differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP)); - ccl_fetch(sd, du) = differential_zero(); - ccl_fetch(sd, dv) = differential_zero(); + sd->dP = ray->dD; + differential_incoming(&sd->dI, sd->dP); + sd->du = differential_zero(); + sd->dv = differential_zero(); #endif } @@ -505,11 +505,11 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { + for(int i = 0; i < sd->num_closure; i++) { if(i == skip_bsdf) continue; - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; @@ -535,8 +535,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg, float light_pdf, bool use_mis) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); @@ -591,22 +591,22 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, { int sampled = 0; - if(ccl_fetch(sd, num_closure) > 1) { + if(sd->num_closure > 1) { /* pick a BSDF closure based on sample weights */ float sum = 0.0f; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) 
{ - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + for(sampled = 0; sampled < sd->num_closure; sampled++) { + const ShaderClosure *sc = &sd->closure[sampled]; if(CLOSURE_IS_BSDF(sc->type)) sum += sc->sample_weight; } - float r = ccl_fetch(sd, randb_closure)*sum; + float r = sd->randb_closure*sum; sum = 0.0f; - for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + for(sampled = 0; sampled < sd->num_closure; sampled++) { + const ShaderClosure *sc = &sd->closure[sampled]; if(CLOSURE_IS_BSDF(sc->type)) { sum += sc->sample_weight; @@ -616,13 +616,13 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, } } - if(sampled == ccl_fetch(sd, num_closure)) { + if(sampled == sd->num_closure) { *pdf = 0.0f; return LABEL_NONE; } } - const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled); + const ShaderClosure *sc = &sd->closure[sampled]; int label; float3 eval; @@ -633,7 +633,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, if(*pdf != 0.0f) { bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass); - if(ccl_fetch(sd, num_closure) > 1) { + if(sd->num_closure > 1) { float sweight = sc->sample_weight; _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight); } @@ -660,8 +660,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd, ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness) { - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF(sc->type)) bsdf_blur(kg, sc, roughness); @@ -670,13 +670,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) { - 
if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) + if(sd->flag & SD_HAS_ONLY_VOLUME) return make_float3(1.0f, 1.0f, 1.0f); float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl eval += sc->weight; @@ -685,6 +685,18 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd) return eval; } +ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd) +{ + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + + if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) { + sc->sample_weight = 0.0f; + sc->weight = make_float3(0.0f, 0.0f, 0.0f); + } + } +} + ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd) { float3 alpha = make_float3(1.0f, 1.0f, 1.0f) - shader_bsdf_transparency(kg, sd); @@ -699,8 +711,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) eval += sc->weight; @@ -713,8 +725,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_GLOSSY(sc->type)) eval += sc->weight; @@ -727,8 +739,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, 
num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type)) eval += sc->weight; @@ -741,8 +753,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type)) eval += sc->weight; @@ -756,8 +768,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac float3 eval = make_float3(0.0f, 0.0f, 0.0f); float3 N = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) { const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc; @@ -766,12 +778,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac } else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) { eval += sc->weight; - N += ccl_fetch(sd, N)*average(sc->weight); + N += sd->N*average(sc->weight); } } if(is_zero(N)) - N = ccl_fetch(sd, N); + N = sd->N; else N = normalize(N); @@ -786,8 +798,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b float3 N = make_float3(0.0f, 0.0f, 0.0f); float texture_blur = 0.0f, weight_sum = 0.0f; - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BSSRDF(sc->type)) { const Bssrdf *bssrdf = (const Bssrdf*)sc; @@ -801,7 +813,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, 
float *texture_b } if(N_) - *N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N); + *N_ = (is_zero(N))? sd->N: normalize(N); if(texture_blur_) *texture_blur_ = texture_blur/weight_sum; @@ -814,7 +826,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc) { - return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I)); + return emissive_simple_eval(sd->Ng, sd->I); } ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) @@ -822,8 +834,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd) float3 eval; eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_EMISSION(sc->type)) eval += emissive_eval(kg, sd, sc)*sc->weight; @@ -838,8 +850,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) { float3 weight = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_HOLDOUT(sc->type)) weight += sc->weight; @@ -853,9 +865,9 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng, ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = randb; + sd->num_closure = 0; + sd->num_closure_extra = 0; + sd->randb_closure = randb; #ifdef __OSL__ if(kg->osl) @@ -869,13 +881,13 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_ DiffuseBsdf *bsdf = 
(DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), make_float3(0.8f, 0.8f, 0.8f)); - bsdf->N = ccl_fetch(sd, N); - ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf); + bsdf->N = sd->N; + sd->flag |= bsdf_diffuse_setup(bsdf); #endif } - if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) { - ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953); + if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) { + sd->lcg_state = lcg_state_init_addrspace(rng, state, 0xb4bc3953); } } @@ -884,9 +896,9 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, int path_flag, ShaderContext ctx) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_extra = 0; + sd->randb_closure = 0.0f; #ifdef __SVM__ #ifdef __OSL__ @@ -901,8 +913,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, float3 eval = make_float3(0.0f, 0.0f, 0.0f); - for(int i = 0; i < ccl_fetch(sd, num_closure); i++) { - const ShaderClosure *sc = ccl_fetch_array(sd, closure, i); + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; if(CLOSURE_IS_BACKGROUND(sc->type)) eval += sc->weight; @@ -1081,9 +1093,9 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg, ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx) { - ccl_fetch(sd, num_closure) = 0; - ccl_fetch(sd, num_closure_extra) = 0; - ccl_fetch(sd, randb_closure) = 0.0f; + sd->num_closure = 0; + sd->num_closure_extra = 0; + sd->randb_closure = 0.0f; /* this will modify sd->P */ #ifdef __SVM__ diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index 06a77a208cb..2483c5f9ae1 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ 
b/intern/cycles/kernel/kernel_shadow.h @@ -45,7 +45,7 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( /* Setup shader data at surface. */ shader_setup_from_ray(kg, shadow_sd, isect, ray); /* Attenuation from transparent surface. */ - if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) { + if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) { path_state_modify_bounce(state, true); shader_eval_surface(kg, shadow_sd, @@ -180,7 +180,7 @@ ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg, return true; } /* Move ray forward. */ - ray->P = ccl_fetch(shadow_sd, P); + ray->P = shadow_sd->P; if(ray->t != FLT_MAX) { ray->D = normalize_len(Pend - ray->P, &ray->t); } @@ -248,7 +248,7 @@ ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg, } # endif /* __SHADOW_RECORD_ALL__ */ -# ifdef __KERNEL_GPU__ +# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__) /* Shadow function to compute how much light is blocked, * * Here we raytrace from one transparent surface to the next step by step. @@ -308,7 +308,7 @@ ccl_device bool shadow_blocked_transparent_stepped_loop( return true; } /* Move ray forward. 
*/ - ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng)); + ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng); if(ray->t != FLT_MAX) { ray->D = normalize_len(Pend - ray->P, &ray->t); } @@ -359,7 +359,7 @@ ccl_device bool shadow_blocked_transparent_stepped( shadow); } -# endif /* __KERNEL_GPU__ */ +# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */ #endif /* __TRANSPARENT_SHADOWS__ */ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, @@ -374,7 +374,7 @@ ccl_device_inline bool shadow_blocked(KernelGlobals *kg, #ifdef __SPLIT_KERNEL__ Ray private_ray = *ray_input; Ray *ray = &private_ray; - Intersection *isect = &kg->isect_shadow[SD_THREAD]; + Intersection *isect = &kernel_split_state.isect_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)]; #else /* __SPLIT_KERNEL__ */ Ray *ray = ray_input; Intersection isect_object; diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index 52c05b85aee..a8fa6432542 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -298,20 +298,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect( for(int hit = 0; hit < num_eval_hits; hit++) { /* Quickly retrieve P and Ng without setting up ShaderData. 
*/ float3 hit_P; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) { + if(sd->type & PRIMITIVE_TRIANGLE) { hit_P = triangle_refine_subsurface(kg, sd, &ss_isect->hits[hit], ray); } #ifdef __OBJECT_MOTION__ - else if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) { + else if(sd->type & PRIMITIVE_MOTION_TRIANGLE) { float3 verts[3]; motion_triangle_vertices( kg, - ccl_fetch(sd, object), + sd->object, kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim), - ccl_fetch(sd, time), + sd->time, verts); hit_P = motion_triangle_refine_subsurface(kg, sd, diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h index 8d5bb75a428..cb1a3f40dee 100644 --- a/intern/cycles/kernel/kernel_textures.h +++ b/intern/cycles/kernel/kernel_textures.h @@ -32,6 +32,7 @@ KERNEL_TEX(uint, texture_uint, __prim_visibility) KERNEL_TEX(uint, texture_uint, __prim_index) KERNEL_TEX(uint, texture_uint, __prim_object) KERNEL_TEX(uint, texture_uint, __object_node) +KERNEL_TEX(float2, texture_float2, __prim_time) /* objects */ KERNEL_TEX(float4, texture_float4, __objects) @@ -177,7 +178,6 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087) KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088) -KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089) # else /* bindless textures */ diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index f518530106c..a7faaef89ca 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -32,6 +32,11 @@ # define ccl_addr_space #endif +#if defined(__SPLIT_KERNEL__) && !defined(__COMPUTE_DEVICE_GPU__) +/* TODO(mai): need to investigate how this effects the kernel, as cpu kernel crashes without this right now */ +#define __COMPUTE_DEVICE_GPU__ +#endif + CCL_NAMESPACE_BEGIN /* constants 
*/ @@ -56,6 +61,8 @@ CCL_NAMESPACE_BEGIN #define VOLUME_STACK_SIZE 16 +#define WORK_POOL_SIZE 64 + /* device capabilities */ #ifdef __KERNEL_CPU__ # ifdef __KERNEL_SSE2__ @@ -63,28 +70,36 @@ CCL_NAMESPACE_BEGIN # endif # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# endif # ifdef WITH_OSL # define __OSL__ # endif -# define __SUBSURFACE__ +# ifndef __SPLIT_KERNEL__ +# define __SUBSURFACE__ +# endif # define __CMJ__ -# define __VOLUME__ -# define __VOLUME_DECOUPLED__ -# define __VOLUME_SCATTER__ -# define __SHADOW_RECORD_ALL__ -# define __VOLUME_RECORD_ALL__ +# ifndef __SPLIT_KERNEL__ +# define __VOLUME__ +# define __VOLUME_DECOUPLED__ +# define __VOLUME_SCATTER__ +# define __SHADOW_RECORD_ALL__ +# define __VOLUME_RECORD_ALL__ +# endif #endif /* __KERNEL_CPU__ */ #ifdef __KERNEL_CUDA__ # define __KERNEL_SHADING__ # define __KERNEL_ADV_SHADING__ -# define __BRANCHED_PATH__ -# define __VOLUME__ -# define __VOLUME_SCATTER__ -# define __SUBSURFACE__ -# define __CMJ__ -# define __SHADOW_RECORD_ALL__ +# ifndef __SPLIT_KERNEL__ +# define __BRANCHED_PATH__ +# define __VOLUME__ +# define __VOLUME_SCATTER__ +# define __SUBSURFACE__ +# define __CMJ__ +# define __SHADOW_RECORD_ALL__ +# endif #endif /* __KERNEL_CUDA__ */ #ifdef __KERNEL_OPENCL__ @@ -798,99 +813,77 @@ enum ShaderDataObjectFlag { SD_OBJECT_INTERSECTS_VOLUME) }; -#ifdef __SPLIT_KERNEL__ -# define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0)) -# if !defined(__SPLIT_KERNEL_SOA__) - /* ShaderData is stored as an Array-of-Structures */ -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (s[SD_THREAD].soa_##t) -# define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index]) -# else - /* ShaderData is stored as an Structure-of-Arrays */ -# define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1)) -# define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t) -# 
define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0) -# define ccl_soa_member(type, name) type soa_##name -# define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) + SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t) -# define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index]) -# endif -#else -# define ccl_soa_member(type, name) type name -# define ccl_fetch(s, t) (s->t) -# define ccl_fetch_array(s, t, index) (&s->t[index]) -#endif - typedef ccl_addr_space struct ShaderData { /* position */ - ccl_soa_member(float3, P); + float3 P; /* smooth normal for shading */ - ccl_soa_member(float3, N); + float3 N; /* true geometric normal */ - ccl_soa_member(float3, Ng); + float3 Ng; /* view/incoming direction */ - ccl_soa_member(float3, I); + float3 I; /* shader id */ - ccl_soa_member(int, shader); + int shader; /* booleans describing shader, see ShaderDataFlag */ - ccl_soa_member(int, flag); + int flag; /* booleans describing object of the shader, see ShaderDataObjectFlag */ - ccl_soa_member(int, object_flag); + int object_flag; /* primitive id if there is one, ~0 otherwise */ - ccl_soa_member(int, prim); + int prim; /* combined type and curve segment for hair */ - ccl_soa_member(int, type); + int type; /* parametric coordinates * - barycentric weights for triangles */ - ccl_soa_member(float, u); - ccl_soa_member(float, v); + float u; + float v; /* object id if there is one, ~0 otherwise */ - ccl_soa_member(int, object); + int object; /* motion blur sample time */ - ccl_soa_member(float, time); + float time; /* length of the ray being shaded */ - ccl_soa_member(float, ray_length); + float ray_length; #ifdef __RAY_DIFFERENTIALS__ /* differential of P. 
these are orthogonal to Ng, not N */ - ccl_soa_member(differential3, dP); + differential3 dP; /* differential of I */ - ccl_soa_member(differential3, dI); + differential3 dI; /* differential of u, v */ - ccl_soa_member(differential, du); - ccl_soa_member(differential, dv); + differential du; + differential dv; #endif #ifdef __DPDU__ /* differential of P w.r.t. parametric coordinates. note that dPdu is * not readily suitable as a tangent for shading on triangles. */ - ccl_soa_member(float3, dPdu); - ccl_soa_member(float3, dPdv); + float3 dPdu; + float3 dPdv; #endif #ifdef __OBJECT_MOTION__ /* object <-> world space transformations, cached to avoid * re-interpolating them constantly for shading */ - ccl_soa_member(Transform, ob_tfm); - ccl_soa_member(Transform, ob_itfm); + Transform ob_tfm; + Transform ob_itfm; #endif /* Closure data, we store a fixed array of closures */ - ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]); - ccl_soa_member(int, num_closure); - ccl_soa_member(int, num_closure_extra); - ccl_soa_member(float, randb_closure); - ccl_soa_member(float3, svm_closure_weight); + struct ShaderClosure closure[MAX_CLOSURE]; + int num_closure; + int num_closure_extra; + float randb_closure; + float3 svm_closure_weight; /* LCG state for closures that require additional random numbers. 
*/ - ccl_soa_member(uint, lcg_state); + uint lcg_state; /* ray start position, only set for backgrounds */ - ccl_soa_member(float3, ray_P); - ccl_soa_member(differential3, ray_dP); + float3 ray_P; + differential3 ray_dP; #ifdef __OSL__ struct KernelGlobals *osl_globals; @@ -1202,7 +1195,8 @@ typedef struct KernelBVH { int have_curves; int have_instancing; int use_qbvh; - int pad1, pad2; + int use_bvh_steps; + int pad1; } KernelBVH; static_assert_align(KernelBVH, 16); diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index c7cb29b5af2..10d0d185345 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -966,7 +966,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( mis_weight = 2.0f*power_heuristic(pdf, distance_pdf); } } - if(sample_t < 1e-6f) { + if(sample_t < 1e-6f || pdf == 0.0f) { return VOLUME_PATH_SCATTERED; } diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 7d559b1aa31..28fc5ce1c30 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -17,177 +17,102 @@ #ifndef __KERNEL_WORK_STEALING_H__ #define __KERNEL_WORK_STEALING_H__ +CCL_NAMESPACE_BEGIN + /* * Utility functions for work stealing */ -#ifdef __WORK_STEALING__ - #ifdef __KERNEL_OPENCL__ # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif -uint get_group_id_with_ray_index(uint ray_index, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - int dim) +ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg) +{ + return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples; +} + +ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg) +{ + return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE; +} + +ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index) +{ + return ray_index / 
WORK_POOL_SIZE; +} + +ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool) { - if(dim == 0) { - uint x_span = ray_index % (tile_dim_x * parallel_samples); - return x_span / get_local_size(0); + uint total_work_size = kernel_total_work_size(kg); + uint num_pools = kernel_num_work_pools(kg); + + if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) { + return 0; + } + + uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE; + + uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE)); + if(work_pool < remainder / WORK_POOL_SIZE) { + work_size += WORK_POOL_SIZE; } - else /*if(dim == 1)*/ { - kernel_assert(dim == 1); - uint y_span = ray_index / (tile_dim_x * parallel_samples); - return y_span / get_local_size(1); + else if(work_pool == remainder / WORK_POOL_SIZE) { + work_size += remainder % WORK_POOL_SIZE; } + + return work_size; } -uint get_total_work(uint tile_dim_x, - uint tile_dim_y, - uint grp_idx, - uint grp_idy, - uint num_samples) +ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index) { - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? 
get_local_size(1) - : threads_within_tile_border_y; - - return threads_within_tile_border_x * - threads_within_tile_border_y * - num_samples; + uint num_pools = kernel_num_work_pools(kg); + uint pool = work_pool_from_ray_index(kg, ray_index); + + return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE) + + (pool * WORK_POOL_SIZE) + + (work_index % WORK_POOL_SIZE); } -/* Returns 0 in case there is no next work available */ -/* Returns 1 in case work assigned is valid */ -int get_next_work(ccl_global uint *work_pool, - ccl_private uint *my_work, - uint tile_dim_x, - uint tile_dim_y, - uint num_samples, - uint parallel_samples, - uint ray_index) +/* Returns true if there is work */ +ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint total_work = get_total_work(tile_dim_x, - tile_dim_y, - grp_idx, - grp_idy, - num_samples); - uint group_index = grp_idy * get_num_groups(0) + grp_idx; - *my_work = atomic_inc(&work_pool[group_index]); - return (*my_work < total_work) ? 1 : 0; + uint work_pool = work_pool_from_ray_index(kg, ray_index); + uint pool_size = work_pool_work_size(kg, work_pool); + + if(pool_size == 0) { + return false; + } + + *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]); + return (*work_index < pool_size); } -/* This function assumes that the passed my_work is valid. */ -/* Decode sample number w.r.t. assigned my_work. */ -uint get_my_sample(uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint parallel_samples, - uint ray_index) +/* This function assumes that the passed `work` is valid. */ +/* Decode sample number w.r.t. assigned `work`. 
*/ +ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; - - return my_work / - (threads_within_tile_border_x * threads_within_tile_border_y); + return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h); } -/* Decode pixel and tile position w.r.t. assigned my_work. */ -void get_pixel_tile_position(ccl_private uint *pixel_x, +/* Decode pixel and tile position w.r.t. assigned `work`. */ +ccl_device void get_work_pixel_tile_position(KernelGlobals *kg, + ccl_private uint *pixel_x, ccl_private uint *pixel_y, ccl_private uint *tile_x, ccl_private uint *tile_y, - uint my_work, - uint tile_dim_x, - uint tile_dim_y, - uint tile_offset_x, - uint tile_offset_y, - uint parallel_samples, + uint work_index, uint ray_index) { - uint grp_idx = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 0); - uint grp_idy = get_group_id_with_ray_index(ray_index, - tile_dim_x, - tile_dim_y, - parallel_samples, - 1); - uint threads_within_tile_border_x = - (grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0) - : get_local_size(0); - uint threads_within_tile_border_y = - (grp_idy == (get_num_groups(1) - 1)) ? 
tile_dim_y % get_local_size(1) - : get_local_size(1); - - threads_within_tile_border_x = - (threads_within_tile_border_x == 0) ? get_local_size(0) - : threads_within_tile_border_x; - threads_within_tile_border_y = - (threads_within_tile_border_y == 0) ? get_local_size(1) - : threads_within_tile_border_y; - - uint total_associated_pixels = - threads_within_tile_border_x * threads_within_tile_border_y; - uint work_group_pixel_index = my_work % total_associated_pixels; - uint work_group_pixel_x = - work_group_pixel_index % threads_within_tile_border_x; - uint work_group_pixel_y = - work_group_pixel_index / threads_within_tile_border_x; - - *pixel_x = - tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x; - *pixel_y = - tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y; - *tile_x = *pixel_x - tile_offset_x; - *tile_y = *pixel_y - tile_offset_y; + uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h); + + *tile_x = pixel_index % kernel_split_params.w; + *tile_y = pixel_index / kernel_split_params.w; + + *pixel_x = *tile_x + kernel_split_params.x; + *pixel_y = *tile_y + kernel_split_params.y; } -#endif /* __WORK_STEALING__ */ +CCL_NAMESPACE_END #endif /* __KERNEL_WORK_STEALING_H__ */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 1a07c705f1c..deb872444d0 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -49,4 +49,39 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, int offset, int sample); +/* Split kernels */ + +void KERNEL_FUNCTION_FULL_NAME(data_init)( + KernelGlobals *kg, + ccl_constant KernelData *data, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int 
*Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer); + +#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data); + +DECLARE_SPLIT_KERNEL_FUNCTION(path_init) +DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DECLARE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval) +DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked) +DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) + +void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); + #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index ec82d4b4c22..d6d0db4e034 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -21,17 +21,39 @@ */ #include "kernel_compat_cpu.h" -#include "kernel_math.h" -#include "kernel_types.h" -#include "kernel_globals.h" -#include "kernel_cpu_image.h" -#include "kernel_film.h" -#include "kernel_path.h" -#include "kernel_path_branched.h" -#include "kernel_bake.h" + +#ifndef __SPLIT_KERNEL__ +# include "kernel_math.h" +# include "kernel_types.h" + +# include "split/kernel_split_data.h" +# include "kernel_globals.h" + +# include "kernel_cpu_image.h" +# include "kernel_film.h" +# include "kernel_path.h" +# include "kernel_path_branched.h" +# include "kernel_bake.h" +#else +# include "split/kernel_split_common.h" + +# include "split/kernel_data_init.h" +# include "split/kernel_path_init.h" +# include "split/kernel_scene_intersect.h" +# include "split/kernel_lamp_emission.h" +# include 
"split/kernel_queue_enqueue.h" +# include "split/kernel_background_buffer_update.h" +# include "split/kernel_shader_eval.h" +# include "split/kernel_holdout_emission_blurring_pathtermination_ao.h" +# include "split/kernel_direct_lighting.h" +# include "split/kernel_shadow_blocked.h" +# include "split/kernel_next_iteration_setup.h" +#endif CCL_NAMESPACE_BEGIN +#ifndef __SPLIT_KERNEL__ + /* Path Tracing */ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg, @@ -131,4 +153,55 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, } } +#else /* __SPLIT_KERNEL__ */ + +/* Split Kernel Path Tracing */ + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \ + { \ + kernel_##name(kg); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked) +DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) + +void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) +{ +#define REGISTER_NAME_STRING(name) #name +#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name) +#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name)); + + REGISTER(path_trace); + REGISTER(convert_to_byte); + REGISTER(convert_to_half_float); + REGISTER(shader); + + REGISTER(data_init); + REGISTER(path_init); + REGISTER(scene_intersect); + REGISTER(lamp_emission); + REGISTER(queue_enqueue); + REGISTER(background_buffer_update); + REGISTER(shader_eval); + REGISTER(holdout_emission_blurring_pathtermination_ao); + REGISTER(direct_lighting); + 
REGISTER(shadow_blocked); + REGISTER(next_iteration_setup); + +#undef REGISTER +#undef REGISTER_EVAL_NAME +#undef REGISTER_NAME_STRING +} + +#endif /* __SPLIT_KERNEL__ */ + CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp new file mode 100644 index 00000000000..30519dae53e --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* CPU kernel entry points */ + +/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this + * one with SSE2 intrinsics. + */ +#if defined(__x86_64__) || defined(_M_X64) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +/* When building kernel for native machine detect kernel features from the flags + * set by compiler. 
+ */ +#ifdef WITH_KERNEL_NATIVE +# ifdef __SSE2__ +# ifndef __KERNEL_SSE2__ +# define __KERNEL_SSE2__ +# endif +# endif +# ifdef __SSE3__ +# define __KERNEL_SSE3__ +# endif +# ifdef __SSSE3__ +# define __KERNEL_SSSE3__ +# endif +# ifdef __SSE4_1__ +# define __KERNEL_SSE41__ +# endif +# ifdef __AVX__ +# define __KERNEL_AVX__ +# endif +# ifdef __AVX2__ +# define __KERNEL_SSE__ +# define __KERNEL_AVX2__ +# endif +#endif + +/* quiet unused define warnings */ +#if defined(__KERNEL_SSE2__) + /* do nothing */ +#endif + +#include "kernel.h" +#define KERNEL_ARCH cpu +#include "kernel_cpu_impl.h" + diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp new file mode 100644 index 00000000000..335ad24bdc5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp @@ -0,0 +1,38 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX +# include "kernel.h" +# define KERNEL_ARCH cpu_avx +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp new file mode 100644 index 00000000000..765ba96aba3 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp @@ -0,0 +1,40 @@ +/* + * Copyright 2011-2014 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with AVX2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE__ +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +# define __KERNEL_AVX__ +# define __KERNEL_AVX2__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 +# include "kernel.h" +# define KERNEL_ARCH cpu_avx2 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp new file mode 100644 index 00000000000..af244c03929 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE2 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 +# include "kernel.h" +# define KERNEL_ARCH cpu_sse2 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp new file mode 100644 index 00000000000..d1b579eeac5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp @@ -0,0 +1,36 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 +# include "kernel.h" +# define KERNEL_ARCH cpu_sse3 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */ diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp new file mode 100644 index 00000000000..83d62de5aa5 --- /dev/null +++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3 + * optimization flags and nearly all functions inlined, while kernel.cpp + * is compiled without for other CPU's. 
*/ + +/* SSE optimization disabled for now on 32 bit, see bug #36316 */ +#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86))) +# define __KERNEL_SSE2__ +# define __KERNEL_SSE3__ +# define __KERNEL_SSSE3__ +# define __KERNEL_SSE41__ +#endif + +#define __SPLIT_KERNEL__ + +#include "util_optimization.h" + +#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 +# include "kernel.h" +# define KERNEL_ARCH cpu_sse41 +# include "kernel_cpu_impl.h" +#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */ diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index 090ab2c50c2..52e541321e3 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -16,7 +16,10 @@ /* CUDA kernel entry points */ +#ifdef __CUDA_ARCH__ + #include "../../kernel_compat_cuda.h" +#include "kernel_config.h" #include "../../kernel_math.h" #include "../../kernel_types.h" #include "../../kernel_globals.h" @@ -25,104 +28,7 @@ #include "../../kernel_path_branched.h" #include "../../kernel_bake.h" -/* device data taken from CUDA occupancy calculator */ - -#ifdef __CUDA_ARCH__ - -/* 2.0 and 2.1 */ -#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 32 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 - -/* 3.0 and 3.5 */ -#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define 
CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.0, 5.2, 5.3, 6.0, 6.1 */ -#elif __CUDA_ARCH__ >= 500 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 48 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif - -/* compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread */ - -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__( \ - threads_block_width*threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ - ) - -/* sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif - -#if CUDA_KERNEL_MAX_REGISTERS > 
CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - /* kernels */ - extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride) diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h new file mode 100644 index 00000000000..9fa39dc9ebb --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* device data taken from CUDA occupancy calculator */ + +/* 2.0 and 2.1 */ +#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 32 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40 + +/* 3.0 and 3.5 */ +#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.2 */ +#elif __CUDA_ARCH__ == 320 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 63 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 3.7 */ +#elif __CUDA_ARCH__ == 370 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 63 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* 5.0, 5.2, 5.3, 6.0, 6.1 */ +#elif __CUDA_ARCH__ >= 500 +# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 +# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 +# define CUDA_BLOCK_MAX_THREADS 1024 +# define CUDA_THREAD_MAX_REGISTERS 255 + +/* tunable parameters */ +# define CUDA_THREADS_BLOCK_WIDTH 16 +# define CUDA_KERNEL_MAX_REGISTERS 48 +# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 + +/* unknown architecture */ +#else 
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" +#endif + +/* compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread */ + +#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ + __launch_bounds__( \ + threads_block_width*threads_block_width, \ + CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \ + ) + +/* sanity checks */ + +#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + +#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif + diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu new file mode 100644 index 00000000000..759475b175f --- /dev/null +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -0,0 +1,125 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* CUDA split kernel entry points */ + +#ifdef __CUDA_ARCH__ + +#define __SPLIT_KERNEL__ + +#include "../../kernel_compat_cuda.h" +#include "kernel_config.h" + +#include "../../split/kernel_split_common.h" +#include "../../split/kernel_data_init.h" +#include "../../split/kernel_path_init.h" +#include "../../split/kernel_scene_intersect.h" +#include "../../split/kernel_lamp_emission.h" +#include "../../split/kernel_queue_enqueue.h" +#include "../../split/kernel_background_buffer_update.h" +#include "../../split/kernel_shader_eval.h" +#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h" +#include "../../split/kernel_direct_lighting.h" +#include "../../split/kernel_shadow_blocked.h" +#include "../../split/kernel_next_iteration_setup.h" + +#include "../../kernel_film.h" + +/* kernels */ +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_state_buffer_size(uint num_threads, uint *size) +{ + *size = split_data_buffer_size(NULL, num_threads); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_path_trace_data_init( + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, + ccl_global uint *rng_state, + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, + ccl_global int *Queue_index, + int queuesize, + ccl_global char *use_queues_flag, + ccl_global unsigned int *work_pool_wgs, + unsigned int num_samples, + ccl_global float *buffer) +{ + kernel_data_init(NULL, + NULL, + split_data_buffer, + num_elements, + ray_state, + rng_state, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, + Queue_index, + queuesize, + use_queues_flag, + work_pool_wgs, + num_samples, + buffer); +} + +#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ + extern "C" __global__ void \ + CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \ + 
kernel_cuda_##name() \ + { \ + kernel_##name(NULL); \ + } + +DEFINE_SPLIT_KERNEL_FUNCTION(path_init) +DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) +DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) +DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue) +DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update) +DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) +DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) +DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting) +DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked) +DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +extern "C" __global__ void +CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) +kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) +{ + int x = sx + blockDim.x*blockIdx.x + threadIdx.x; + int y = sy + blockDim.y*blockIdx.y + threadIdx.y; + + if(x < sx + sw && y < sy + sh) + kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); +} + +#endif + diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index a68f97857b6..52406d2f548 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -67,8 +67,8 @@ __kernel void kernel_ocl_path_trace( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + 
ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride); @@ -96,7 +96,7 @@ __kernel void kernel_ocl_shader( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); + int x = sx + ccl_global_id(0); if(x < sx + sw) { kernel_shader_evaluate(kg, @@ -128,7 +128,7 @@ __kernel void kernel_ocl_bake( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); + int x = sx + ccl_global_id(0); if(x < sx + sw) { #ifdef __NO_BAKING__ @@ -159,8 +159,8 @@ __kernel void kernel_ocl_convert_to_byte( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride); @@ -186,11 +186,27 @@ __kernel void kernel_ocl_convert_to_half_float( kg->name = name; #include "../../kernel_textures.h" - int x = sx + get_global_id(0); - int y = sy + get_global_id(1); + int x = sx + ccl_global_id(0); + int y = sy + ccl_global_id(1); if(x < sx + sw && y < sy + sh) kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride); } +__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset) +{ + size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + if(i < size / sizeof(float4)) { + buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + else if(i == size / sizeof(float4)) { + ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)]; + + for(i = 0; i < size % sizeof(float4); i++) { + *(b++) = 0; + } + } +} + #endif /* __COMPILE_ONLY_MEGAKERNEL__ */ diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl index 1914d241eb1..47e363f6e03 100644 --- 
a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl @@ -14,112 +14,13 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_background_buffer_update.h" __kernel void kernel_ocl_path_trace_background_buffer_update( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - ccl_global int *Queue_data, /* Queues memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int 
ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(ray_index == 0) { - /* We will empty this queue in this kernel. */ - Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; - } - char enqueue_flag = 0; - ray_index = get_ray_index(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - Queue_data, - queuesize, - 1); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = - kernel_background_buffer_update((KernelGlobals *)kg, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - L_transparent_coop, - ray_state, - sw, sh, sx, sy, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - work_array, - end_sample, - start_sample, -#ifdef __WORK_STEALING__ - work_pool_wgs, - num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - parallel_samples, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; - * These rays will be made active during next SceneIntersectkernel. 
- */ - enqueue_ray_index_local(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); + kernel_background_buffer_update(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl index 18139687eab..1e3c4fa28c7 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl @@ -14,77 +14,49 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_data_init.h" __kernel void kernel_ocl_path_trace_data_init( - ccl_global char *globals, - ccl_global char *sd_DL_shadow, + KernelGlobals *kg, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, #include "../../kernel_textures.h" - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + int 
start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef __WORK_STEALING__ ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global float *buffer) { - kernel_data_init((KernelGlobals *)globals, - (ShaderData *)sd_DL_shadow, + kernel_data_init(kg, data, - per_sample_output_buffers, - rng_state, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop_shadow, + split_data_buffer, + num_elements, ray_state, + rng_state, #define KERNEL_TEX(type, ttype, name) name, #include "../../kernel_textures.h" - start_sample, sx, sy, sw, sh, offset, stride, - rng_state_offset_x, - rng_state_offset_y, - rng_state_stride, - Queue_data, + start_sample, + end_sample, + sx, sy, sw, sh, offset, stride, Queue_index, queuesize, use_queues_flag, - work_array, -#ifdef __WORK_STEALING__ work_pool_wgs, num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - parallel_samples); + buffer); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl index c6a2c8d050c..5d2f46b319d 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl @@ -14,74 +14,13 @@ * limitations under the License. 
*/ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_direct_lighting.h" __kernel void kernel_ocl_path_trace_direct_lighting( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. 
- */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - PathState_coop, - ISLamp_coop, - LightRay_coop, - BSDFEval_coop, - ray_state, - ray_index); - -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - -#ifdef __EMISSION__ - /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_DL_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); -#endif + kernel_direct_lighting(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl index e063614da1a..7724b8a0bdf 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl @@ -14,110 +14,13 @@ * limitations under the License. 
*/ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_holdout_emission_blurring_pathtermination_ao.h" __kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics_bg; - ccl_local unsigned int local_queue_atomics_ao; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics_bg = 0; - local_queue_atomics_ao = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - char enqueue_flag = 0; - 
char enqueue_flag_AO_SHADOW_RAY_CAST = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif /* __COMPUTE_DEVICE_GPU__ */ - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - kernel_holdout_emission_blurring_pathtermination_ao( - (KernelGlobals *)kg, - (ShaderData *)sd, - per_sample_output_buffers, - rng_coop, - throughput_coop, - L_transparent_coop, - PathRadiance_coop, - PathState_coop, - Intersection_coop, - AOAlpha_coop, - AOBSDF_coop, - AOLightRay_coop, - sw, sh, sx, sy, stride, - ray_state, - work_array, -#ifdef __WORK_STEALING__ - start_sample, -#endif - parallel_samples, - ray_index, - &enqueue_flag, - &enqueue_flag_AO_SHADOW_RAY_CAST); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. */ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics_bg, - Queue_data, - Queue_index); - -#ifdef __AO__ - /* Enqueue to-shadow-ray-cast rays. 
*/ - enqueue_ray_index_local(ray_index, - QUEUE_SHADOW_RAY_CAST_AO_RAYS, - enqueue_flag_AO_SHADOW_RAY_CAST, - queuesize, - &local_queue_atomics_ao, - Queue_data, - Queue_index); -#endif + kernel_holdout_emission_blurring_pathtermination_ao(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl index 267bddc2ffc..2b84d0ea43e 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl @@ -14,67 +14,13 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_lamp_emission.h" __kernel void kernel_ocl_path_trace_lamp_emission( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int parallel_samples) /* Number of samples to be processed in parallel */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - int x = get_global_id(0); - int y = get_global_id(1); - - /* We will empty this queue in this kernel. */ - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; - } - /* Fetch use_queues_flag. 
*/ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 1); - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - - kernel_lamp_emission((KernelGlobals *)kg, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, - ray_index); + kernel_lamp_emission(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl index 6d49b6294a8..e87e367fb9c 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl @@ -14,101 +14,13 @@ * limitations under the License. 
*/ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_next_iteration_setup.h" __kernel void kernel_ocl_path_trace_next_iteration_setup( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize, /* Size (capacity) of each queue */ - ccl_global char *use_queues_flag) /* flag to decide if scene_intersect kernel should - * use queues to fetch ray index */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(get_global_id(0) == 0 && get_global_id(1) == 0) { - /* If we are here, then it means that scene-intersect kernel - * has already been executed atleast once. 
From the next time, - * scene-intersect kernel may operate on queues to fetch ray index - */ - use_queues_flag[0] = 1; - - /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and - * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the - * previous kernel. - */ - Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; - Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; - } - - char enqueue_flag = 0; - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - -#ifdef __COMPUTE_DEVICE_GPU__ - /* If we are executing on a GPU device, we exit all threads that are not - * required. - * - * If we are executing on a CPU device, then we need to keep all threads - * active since we have barrier() calls later in the kernel. CPU devices, - * expect all threads to execute barrier statement. - */ - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } -#endif - -#ifndef __COMPUTE_DEVICE_GPU__ - if(ray_index != QUEUE_EMPTY_SLOT) { -#endif - enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - throughput_coop, - PathRadiance_coop, - Ray_coop, - PathState_coop, - LightRay_dl_coop, - ISLamp_coop, - BSDFEval_coop, - LightRay_ao_coop, - AOBSDF_coop, - AOAlpha_coop, - ray_state, - use_queues_flag, - ray_index); -#ifndef __COMPUTE_DEVICE_GPU__ - } -#endif - - /* Enqueue RAY_UPDATE_BUFFER rays. 
*/ - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); + kernel_next_iteration_setup(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl new file mode 100644 index 00000000000..7e9e4a02529 --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl @@ -0,0 +1,26 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" +#include "split/kernel_path_init.h" + +__kernel void kernel_ocl_path_trace_path_init( + KernelGlobals *kg, + ccl_constant KernelData *data) +{ + kernel_path_init(kg); +} diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl index 3156dc255fb..9ceb6a5c3d8 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl @@ -14,93 +14,13 @@ * limitations under the License. 
*/ -#include "../../kernel_compat_opencl.h" -#include "../../kernel_math.h" -#include "../../kernel_types.h" -#include "../../kernel_globals.h" -#include "../../kernel_queues.h" +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" +#include "split/kernel_queue_enqueue.h" -/* - * The kernel "kernel_queue_enqueue" enqueues rays of - * different ray state into their appropriate Queues; - * 1. Rays that have been determined to hit the background from the - * "kernel_scene_intersect" kernel - * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. - * - * The input and output of the kernel is as follows, - * - * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) - * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | - * queuesize -------------------------------------------| | - * - * Note on Queues : - * State of queues during the first time this kernel is called : - * At entry, - * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays - * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. - * - * State of queue during other times this kernel is called : - * At entry, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. - * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. - * At exit, - * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. 
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. - */ __kernel void kernel_ocl_path_trace_queue_enqueue( - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int queuesize) /* Size (capacity) of each queue */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - /* We have only 2 cases (Hit/Not-Hit) */ - ccl_local unsigned int local_queue_atomics[2]; - - int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0); - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - - if(lidx < 2 ) { - local_queue_atomics[lidx] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int queue_number = -1; - - if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { - queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; - } - else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; - } - - unsigned int my_lqidx; - if(queue_number != -1) { - my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); - } - barrier(CLK_LOCAL_MEM_FENCE); - - if(lidx == 0) { - local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = - get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, - local_queue_atomics, - Queue_index); - local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = - get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - local_queue_atomics, - Queue_index); - } - barrier(CLK_LOCAL_MEM_FENCE); - - unsigned int my_gqidx; - if(queue_number != -1) { - my_gqidx = get_global_queue_index(queue_number, - queuesize, - my_lqidx, - local_queue_atomics); - Queue_data[my_gqidx] = ray_index; - } + kernel_queue_enqueue(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl index 
7f3f433c7a6..4e083e87d1c 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl @@ -14,67 +14,13 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_scene_intersect.h" __kernel void kernel_ocl_path_trace_scene_intersect( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global int *Queue_data, /* Memory for queues */ - ccl_global int *Queue_index, /* Tracks the number of elements in queues */ - int queuesize, /* Size (capacity) of queues */ - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - int x = get_global_id(0); - int y = get_global_id(1); - - /* Fetch use_queues_flag */ - ccl_local char local_use_queues_flag; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_use_queues_flag = use_queues_flag[0]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index; - if(local_use_queues_flag) { - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(thread_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - } else { - if(x < (sw * parallel_samples) && y < sh) { - ray_index = x + y * (sw * parallel_samples); - } else { - return; - } - } - - kernel_scene_intersect((KernelGlobals *)kg, - 
rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - sw, sh, - use_queues_flag, -#ifdef __KERNEL_DEBUG__ - debugdata_coop, -#endif - ray_index); + kernel_scene_intersect(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl index c37856c8f30..a2b48b15928 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl @@ -14,55 +14,13 @@ * limitations under the License. */ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_shader_eval.h" __kernel void kernel_ocl_path_trace_shader_eval( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global char *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global int *Queue_data, /* queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. 
*/ - ccl_local unsigned int local_queue_atomics; - if(get_local_id(0) == 0 && get_local_id(1) == 0) { - local_queue_atomics = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - ray_index = get_ray_index(ray_index, - QUEUE_ACTIVE_AND_REGENERATED_RAYS, - Queue_data, - queuesize, - 0); - - if(ray_index == QUEUE_EMPTY_SLOT) { - return; - } - - char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; - enqueue_ray_index_local(ray_index, - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, - enqueue_flag, - queuesize, - &local_queue_atomics, - Queue_data, - Queue_index); - - /* Continue on with shader evaluation. */ - kernel_shader_eval((KernelGlobals *)kg, - (ShaderData *)sd, - rng_coop, - Ray_coop, - PathState_coop, - Intersection_coop, - ray_state, - ray_index); + kernel_shader_eval(kg); } diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl index edf76fba714..3693f7f9c9d 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl @@ -14,52 +14,13 @@ * limitations under the License. 
*/ +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" #include "split/kernel_shadow_blocked.h" __kernel void kernel_ocl_path_trace_shadow_blocked( - ccl_global char *kg, - ccl_constant KernelData *data, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - ccl_global int *Queue_data, /* Queue memory */ - ccl_global int *Queue_index, /* Tracks the number of elements in each queue */ - int queuesize) /* Size (capacity) of each queue */ + KernelGlobals *kg, + ccl_constant KernelData *data) { - int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0); - - ccl_local unsigned int ao_queue_length; - ccl_local unsigned int dl_queue_length; - if(lidx == 0) { - ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; - dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* flag determining if the current ray is to process shadow ray for AO or DL */ - char shadow_blocked_type = -1; - - int ray_index = QUEUE_EMPTY_SLOT; - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - if(thread_index < ao_queue_length + dl_queue_length) { - if(thread_index < ao_queue_length) { - ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; - } else { - ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1); - shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; - } - } - - if(ray_index == QUEUE_EMPTY_SLOT) - return; - - kernel_shadow_blocked((KernelGlobals *)kg, - PathState_coop, - LightRay_dl_coop, - LightRay_ao_coop, - ray_state, - shadow_blocked_type, - ray_index); + kernel_shadow_blocked(kg); } diff --git 
a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl new file mode 100644 index 00000000000..0a1843ff8bd --- /dev/null +++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl @@ -0,0 +1,29 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_compat_opencl.h" +#include "split/kernel_split_common.h" + +__kernel void kernel_ocl_path_trace_state_buffer_size( + KernelGlobals *kg, + ccl_constant KernelData *data, + uint num_threads, + ccl_global uint *size) +{ + kg->data = data; + *size = split_data_buffer_size(kg, num_threads); +} + diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl deleted file mode 100644 index 88a1ed830af..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "split/kernel_sum_all_radiance.h" - -__kernel void kernel_ocl_path_trace_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - kernel_sum_all_radiance(data, - buffer, - per_sample_output_buffer, - parallel_samples, - sw, sh, stride, - buffer_offset_x, - buffer_offset_y, - buffer_stride, - start_sample); -} diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp index 3614717e28c..d3a69d39597 100644 --- a/intern/cycles/kernel/osl/osl_bssrdf.cpp +++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp @@ -78,7 +78,7 @@ public: bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); @@ -89,7 +89,7 @@ public: bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); @@ -100,7 +100,7 @@ public: bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = 
params.N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } } diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp index 94de782dca0..fe61587d179 100644 --- a/intern/cycles/kernel/osl/osl_closures.cpp +++ b/intern/cycles/kernel/osl/osl_closures.cpp @@ -42,6 +42,7 @@ #include "kernel_types.h" #include "kernel_compat_cpu.h" +#include "split/kernel_split_data_types.h" #include "kernel_globals.h" #include "kernel_montecarlo.h" #include "kernel_random.h" diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 58bbdc33920..b08353e82d1 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -39,6 +39,7 @@ #include "util_string.h" #include "kernel_compat_cpu.h" +#include "split/kernel_split_data_types.h" #include "kernel_globals.h" #include "kernel_random.h" #include "kernel_projection.h" diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp index 0d762bbdb38..c7e9f57b18a 100644 --- a/intern/cycles/kernel/osl/osl_shader.cpp +++ b/intern/cycles/kernel/osl/osl_shader.cpp @@ -19,6 +19,7 @@ #include "kernel_compat_cpu.h" #include "kernel_montecarlo.h" #include "kernel_types.h" +#include "split/kernel_split_data_types.h" #include "kernel_globals.h" #include "geom/geom_object.h" diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h index 9bfa71c75ef..04aaf1bbaad 100644 --- a/intern/cycles/kernel/split/kernel_background_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_background_buffer_update kernel. 
* This is the fourth kernel in the ray tracing logic, and the third @@ -69,80 +69,77 @@ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty */ -ccl_device char kernel_background_buffer_update( - KernelGlobals *kg, - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* Required for buffer Update */ - ccl_global float3 *throughput_coop, /* Required for background hit processing */ - PathRadiance *PathRadiance_coop, /* Required for background hit processing and buffer Update */ - ccl_global Ray *Ray_coop, /* Required for background hit processing */ - ccl_global PathState *PathState_coop, /* Required for background hit processing */ - ccl_global float *L_transparent_coop, /* Required for background hit processing and buffer Update */ - ccl_global char *ray_state, /* Stores information on the current state of a ray */ - int sw, int sh, int sx, int sy, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global unsigned int *work_array, /* Denotes work of each ray */ - int end_sample, - int start_sample, -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index) +ccl_device void kernel_background_buffer_update(KernelGlobals *kg) { + ccl_local unsigned int local_queue_atomics; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(ray_index == 0) { + /* We will empty this queue in this kernel. 
*/ + kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0; + } char enqueue_flag = 0; + ray_index = get_ray_index(kg, ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + ccl_global uint *rng_state = kernel_split_params.rng_state; + int stride = kernel_split_params.stride; + + ccl_global char *ray_state = kernel_split_state.ray_state; #ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; + DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; #endif - ccl_global PathState *state = &PathState_coop[ray_index]; - PathRadiance *L = L = &PathRadiance_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global float *L_transparent = &L_transparent_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - -#ifdef __WORK_STEALING__ - unsigned int my_work; - ccl_global float *initial_per_sample_output_buffers; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index]; + ccl_global uint *rng = &kernel_split_state.rng[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer; + + unsigned 
int work_index; ccl_global uint *initial_rng; -#endif + unsigned int sample; unsigned int tile_x; unsigned int tile_y; unsigned int pixel_x; unsigned int pixel_y; - unsigned int my_sample_tile; -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, + work_index = kernel_split_state.work_array[ray_index]; + sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, + work_index, ray_index); - my_sample_tile = 0; - initial_per_sample_output_buffers = per_sample_output_buffers; initial_rng = rng_state; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - int tile_index = ray_index / parallel_samples; - /* buffer and rng_state's stride is "stride". Find x and y using ray_index */ - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; + + rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride; + buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { /* eval background shader if nothing hit */ @@ -157,7 +154,7 @@ ccl_device char kernel_background_buffer_update( if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { #ifdef __BACKGROUND__ /* sample background shader */ - float3 L_background = indirect_background(kg, kg->sd_input, state, ray); + float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, 
ray); path_radiance_accum_background(L, (*throughput), L_background, state->bounce); #endif ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); @@ -166,55 +163,38 @@ ccl_device char kernel_background_buffer_update( if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { float3 L_sum = path_radiance_clamp_and_sum(kg, L); - kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); + kernel_write_light_passes(kg, buffer, L, sample); #ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); + kernel_write_debug_passes(kg, buffer, state, debug_data, sample); #endif float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); /* accumulate result in output buffer */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + kernel_write_pass_float4(buffer, sample, L_rad); path_rng_end(kg, rng_state, *rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ /* We have completed current work; So get next work */ - int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); + int valid_work = get_next_work(kg, &work_index, ray_index); if(!valid_work) { /* If work is invalid, this means no more work is available and the thread may exit */ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); } -#else /* __WORK_STEALING__ */ - if((sample + parallel_samples) >= end_sample) { - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); - } -#endif /* __WORK_STEALING__ */ if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { -#ifdef __WORK_STEALING__ - work_array[ray_index] = my_work; + kernel_split_state.work_array[ray_index] = work_index; /* Get the sample associated with the current work */ - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; + sample = get_work_sample(kg, work_index, ray_index) + 
kernel_split_params.start_sample; /* Get pixel and tile position associated with current work */ - get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index); - my_sample_tile = 0; + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); /* Remap rng_state according to the current work */ - rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride); - /* Remap per_sample_output_buffers according to the current work */ - per_sample_output_buffers = initial_per_sample_output_buffers - + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride; -#else /* __WORK_STEALING__ */ - work_array[ray_index] = sample + parallel_samples; - sample = work_array[ray_index]; - - /* Get ray position from ray index */ - pixel_x = sx + ((ray_index / parallel_samples) % sw); - pixel_y = sy + ((ray_index / parallel_samples) / sw); -#endif /* __WORK_STEALING__ */ + rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride; + /* Remap buffer according to the current work */ + buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; /* Initialize random numbers and ray. */ kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); @@ -226,7 +206,7 @@ ccl_device char kernel_background_buffer_update( *throughput = make_float3(1.0f, 1.0f, 1.0f); *L_transparent = 0.0f; path_radiance_init(L, kernel_data.film.use_light_pass); - path_state_init(kg, kg->sd_input, state, rng, sample, ray); + path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, rng, sample, ray); #ifdef __KERNEL_DEBUG__ debug_data_init(debug_data); #endif @@ -237,12 +217,29 @@ ccl_device char kernel_background_buffer_update( /* These rays do not participate in path-iteration. 
*/ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + kernel_write_pass_float4(buffer, sample, L_rad); path_rng_end(kg, rng_state, *rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); } } } - return enqueue_flag; + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; + * These rays will be made active during next SceneIntersectkernel. + */ + enqueue_ray_index_local(ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 6e158d53d23..9b62d65ffd9 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -14,108 +14,105 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_data_initialization kernel * This kernel Initializes structures needed in path-iteration kernels. - * This is the first kernel in ray-tracing logic. 
- * - * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE - * - * Its input and output are as follows, - * - * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng - * Un-initialized throughput -------| |--- Initialized throughput - * Un-initialized L_transparent ----| |--- Initialized L_transparent - * Un-initialized PathRadiance -----| |--- Initialized PathRadiance - * Un-initialized Ray --------------| |--- Initialized Ray - * Un-initialized PathState --------| |--- Initialized PathState - * Un-initialized QueueData --------| |--- Initialized QueueData (to QUEUE_EMPTY_SLOT) - * Un-initialized QueueIndex -------| |--- Initialized QueueIndex (to 0) - * Un-initialized use_queues_flag---| |--- Initialized use_queues_flag (to false) - * Un-initialized ray_state --------| |--- Initialized ray_state - * parallel_samples --------------- | |--- Initialized per_sample_output_buffers - * rng_state -----------------------| |--- Initialized work_array - * data ----------------------------| |--- Initialized work_pool_wgs - * start_sample --------------------| | - * sx ------------------------------| | - * sy ------------------------------| | - * sw ------------------------------| | - * sh ------------------------------| | - * stride --------------------------| | - * queuesize -----------------------| | - * num_samples ---------------------| | * * Note on Queues : * All slots in queues are initialized to queue empty slot; * The number of elements in the queues is initialized to 0; */ + +/* distributes an amount of work across all threads + * note: work done inside the loop may not show up to all threads till after the current kernel has completed + */ +#define parallel_for(kg, iter_name, work_size) \ + for(size_t _size = (work_size), \ + _global_size = ccl_global_size(0) * ccl_global_size(1), \ + _n = _size / _global_size, \ + _thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \ + iter_name = (_n > 0) ? 
(_thread * _n) : (_thread) \ + ; \ + (iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \ + ; \ + iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \ + ) + +#ifndef __KERNEL_CPU__ ccl_device void kernel_data_init( +#else +void KERNEL_FUNCTION_FULL_NAME(data_init)( +#endif KernelGlobals *kg, - ShaderData *sd_DL_shadow, ccl_constant KernelData *data, - ccl_global float *per_sample_output_buffers, + ccl_global void *split_data_buffer, + int num_elements, + ccl_global char *ray_state, ccl_global uint *rng_state, - ccl_global uint *rng_coop, /* rng array to store rng values for all rays */ - ccl_global float3 *throughput_coop, /* throughput array to store throughput values for all rays */ - ccl_global float *L_transparent_coop, /* L_transparent array to store L_transparent values for all rays */ - PathRadiance *PathRadiance_coop, /* PathRadiance array to store PathRadiance values for all rays */ - ccl_global Ray *Ray_coop, /* Ray array to store Ray information for all rays */ - ccl_global PathState *PathState_coop, /* PathState array to store PathState information for all rays */ - Intersection *Intersection_coop_shadow, - ccl_global char *ray_state, /* Stores information on current state of a ray */ +#ifdef __KERNEL_OPENCL__ #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, #include "../kernel_textures.h" +#endif - int start_sample, int sx, int sy, int sw, int sh, int offset, int stride, - int rng_state_offset_x, - int rng_state_offset_y, - int rng_state_stride, - ccl_global int *Queue_data, /* Memory for queues */ + int start_sample, + int end_sample, + int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, /* Tracks the number of elements in queues */ int queuesize, /* size (capacity) of the queue */ ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues to fetch ray index */ - 
ccl_global unsigned int *work_array, /* work array to store which work each ray belongs to */ -#ifdef __WORK_STEALING__ - ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */ - unsigned int num_samples, /* Total number of samples per pixel */ -#endif -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int parallel_samples) /* Number of samples to be processed in parallel */ + ccl_global unsigned int *work_pools, /* Work pool for each work group */ + unsigned int num_samples, + ccl_global float *buffer) { +#ifdef __KERNEL_OPENCL__ kg->data = data; - kg->sd_input = sd_DL_shadow; - kg->isect_shadow = Intersection_coop_shadow; +#endif + + kernel_split_params.x = sx; + kernel_split_params.y = sy; + kernel_split_params.w = sw; + kernel_split_params.h = sh; + + kernel_split_params.offset = offset; + kernel_split_params.stride = stride; + + kernel_split_params.rng_state = rng_state; + + kernel_split_params.start_sample = start_sample; + kernel_split_params.end_sample = end_sample; + + kernel_split_params.work_pools = work_pools; + kernel_split_params.num_samples = num_samples; + + kernel_split_params.queue_index = Queue_index; + kernel_split_params.queue_size = queuesize; + kernel_split_params.use_queues_flag = use_queues_flag; + + kernel_split_params.buffer = buffer; + + split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state); + +#ifdef __KERNEL_OPENCL__ #define KERNEL_TEX(type, ttype, name) \ kg->name = name; #include "../kernel_textures.h" +#endif - int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0); - -#ifdef __WORK_STEALING__ - int lid = get_local_id(1) * get_local_size(0) + get_local_id(0); - /* Initialize work_pool_wgs */ - if(lid == 0) { - int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0); - work_pool_wgs[group_index] = 0; - } - barrier(CLK_LOCAL_MEM_FENCE); -#endif /* __WORK_STEALING__ */ + int thread_index = ccl_global_id(1) * ccl_global_size(0) + 
ccl_global_id(0); /* Initialize queue data and queue index. */ if(thread_index < queuesize) { /* Initialize active ray queue. */ - Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; /* Initialize background and buffer update queue. */ - Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; /* Initialize shadow ray cast of AO queue. */ - Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; /* Initialize shadow ray cast of direct lighting queue. */ - Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; + kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT; } if(thread_index == 0) { @@ -126,109 +123,32 @@ ccl_device void kernel_data_init( /* The scene-intersect kernel should not use the queues very first time. * since the queue would be empty. */ - use_queues_flag[0] = 0; + *use_queues_flag = 0; } - int x = get_global_id(0); - int y = get_global_id(1); + /* zero the tiles pixels and initialize rng_state if this is the first sample */ + if(start_sample == 0) { + parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) { + int pixel = i / kernel_data.film.pass_stride; + int pass = i % kernel_data.film.pass_stride; - if(x < (sw * parallel_samples) && y < sh) { - int ray_index = x + y * (sw * parallel_samples); + int x = sx + pixel % sw; + int y = sy + pixel / sw; - /* This is the first assignment to ray_state; - * So we dont use ASSIGN_RAY_STATE macro. 
- */ - ray_state[ray_index] = RAY_ACTIVE; - - unsigned int my_sample; - unsigned int pixel_x; - unsigned int pixel_y; - unsigned int tile_x; - unsigned int tile_y; - unsigned int my_sample_tile; - -#ifdef __WORK_STEALING__ - unsigned int my_work = 0; - /* Get work. */ - get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index); - /* Get the sample associated with the work. */ - my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - - my_sample_tile = 0; - - /* Get pixel and tile position associated with the work. */ - get_pixel_tile_position(&pixel_x, &pixel_y, - &tile_x, &tile_y, - my_work, - sw, sh, sx, sy, - parallel_samples, - ray_index); - work_array[ray_index] = my_work; -#else /* __WORK_STEALING__ */ - unsigned int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); - my_sample = my_sample_tile + start_sample; - - /* Initialize work array. */ - work_array[ray_index] = my_sample ; - - /* Calculate pixel position of this ray. */ - pixel_x = sx + tile_x; - pixel_y = sy + tile_y; -#endif /* __WORK_STEALING__ */ - - rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride; - - /* Initialise per_sample_output_buffers to all zeros. */ - per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride; - int per_sample_output_buffers_iterator = 0; - for(per_sample_output_buffers_iterator = 0; - per_sample_output_buffers_iterator < kernel_data.film.pass_stride; - per_sample_output_buffers_iterator++) - { - per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f; - } + int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass; - /* Initialize random numbers and ray. 
*/ - kernel_path_trace_setup(kg, - rng_state, - my_sample, - pixel_x, pixel_y, - &rng_coop[ray_index], - &Ray_coop[ray_index]); - - if(Ray_coop[ray_index].t != 0.0f) { - /* Initialize throughput, L_transparent, Ray, PathState; - * These rays proceed with path-iteration. - */ - throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f); - L_transparent_coop[ray_index] = 0.0f; - path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass); - path_state_init(kg, - kg->sd_input, - &PathState_coop[ray_index], - &rng_coop[ray_index], - my_sample, - &Ray_coop[ray_index]); -#ifdef __KERNEL_DEBUG__ - debug_data_init(&debugdata_coop[ray_index]); -#endif - } - else { - /* These rays do not participate in path-iteration. */ - float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); - /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad); - path_rng_end(kg, rng_state, rng_coop[ray_index]); - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); + *(buffer + index) = 0.0f; } - } - /* Mark rest of the ray-state indices as RAY_INACTIVE. */ - if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) { - /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */ - ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE; + parallel_for(kg, i, sw * sh) { + int x = sx + i % sw; + int y = sy + i / sw; + + int index = (offset + x + y*stride); + *(rng_state + index) = hash_int_2d(x, y); + } } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h index 82ca18829d3..5163b8edc04 100644 --- a/intern/cycles/kernel/split/kernel_direct_lighting.h +++ b/intern/cycles/kernel/split/kernel_direct_lighting.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_direct_lighting kernel. 
* This is the eighth kernel in the ray tracing logic. This is the seventh @@ -47,28 +47,50 @@ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty. */ -ccl_device char kernel_direct_lighting( - KernelGlobals *kg, - ShaderData *sd, /* Required for direct lighting */ - ccl_global uint *rng_coop, /* Required for direct lighting */ - ccl_global PathState *PathState_coop, /* Required for direct lighting */ - ccl_global int *ISLamp_coop, /* Required for direct lighting */ - ccl_global Ray *LightRay_coop, /* Required for direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for direct lighting */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) +ccl_device void kernel_direct_lighting(KernelGlobals *kg) { + ccl_local unsigned int local_queue_atomics; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + char enqueue_flag = 0; - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global PathState *state = &PathState_coop[ray_index]; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. 
+ */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ShaderData *sd = &kernel_split_state.sd[ray_index]; /* direct lighting */ #ifdef __EMISSION__ if((kernel_data.integrator.use_direct_light && - (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) + (sd->flag & SD_BSDF_HAS_EVAL))) { /* Sample illumination from lights to find path contribution. */ - ccl_global RNG* rng = &rng_coop[ray_index]; + ccl_global RNG* rng = &kernel_split_state.rng[ray_index]; float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT); float light_u, light_v; path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v); @@ -77,32 +99,48 @@ ccl_device char kernel_direct_lighting( LightSample ls; if(light_sample(kg, light_t, light_u, light_v, - ccl_fetch(sd, time), - ccl_fetch(sd, P), + sd->time, + sd->P, state->bounce, &ls)) { Ray light_ray; #ifdef __OBJECT_MOTION__ - light_ray.time = ccl_fetch(sd, time); + light_ray.time = sd->time; #endif BsdfEval L_light; bool is_lamp; - if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { + if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) { /* Write intermediate data to global memory to access from * the next kernel. */ - LightRay_coop[ray_index] = light_ray; - BSDFEval_coop[ray_index] = L_light; - ISLamp_coop[ray_index] = is_lamp; + kernel_split_state.light_ray[ray_index] = light_ray; + kernel_split_state.bsdf_eval[ray_index] = L_light; + kernel_split_state.is_lamp[ray_index] = is_lamp; /* Mark ray state for next shadow kernel. 
*/ - ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); + ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); enqueue_flag = 1; } } } #endif /* __EMISSION__ */ } - return enqueue_flag; + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + +#ifdef __EMISSION__ + /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_DL_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif } + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 5d951b972ed..7168efa59ae 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel. * This is the sixth kernel in the ray tracing logic. 
This is the fifth @@ -70,101 +70,105 @@ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO */ -ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( - KernelGlobals *kg, - ShaderData *sd, /* Required throughout the kernel except probabilistic path termination and AO */ - ccl_global float *per_sample_output_buffers, - ccl_global uint *rng_coop, /* Required for "kernel_write_data_passes" and AO */ - ccl_global float3 *throughput_coop, /* Required for handling holdout material and AO */ - ccl_global float *L_transparent_coop, /* Required for handling holdout material */ - PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */ - ccl_global PathState *PathState_coop, /* Required throughout the kernel and AO */ - Intersection *Intersection_coop, /* Required for indirect primitive emission */ - ccl_global float3 *AOAlpha_coop, /* Required for AO */ - ccl_global float3 *AOBSDF_coop, /* Required for AO */ - ccl_global Ray *AOLightRay_coop, /* Required for AO */ - int sw, int sh, int sx, int sy, int stride, - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global unsigned int *work_array, /* Denotes the work that each ray belongs to */ -#ifdef __WORK_STEALING__ - unsigned int start_sample, -#endif - int parallel_samples, /* Number of samples to be processed in parallel */ - int ray_index, - char *enqueue_flag, - char *enqueue_flag_AO_SHADOW_RAY_CAST) +ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobals *kg) { -#ifdef __WORK_STEALING__ - unsigned int my_work; + ccl_local unsigned int local_queue_atomics_bg; + ccl_local unsigned int local_queue_atomics_ao; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics_bg = 0; + local_queue_atomics_ao = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + 
+ char enqueue_flag = 0; + char enqueue_flag_AO_SHADOW_RAY_CAST = 0; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif /* __COMPUTE_DEVICE_GPU__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif + + int stride = kernel_split_params.stride; + + unsigned int work_index; unsigned int pixel_x; unsigned int pixel_y; -#endif + unsigned int tile_x; unsigned int tile_y; - int my_sample_tile; unsigned int sample; ccl_global RNG *rng = 0x0; ccl_global PathState *state = 0x0; float3 throughput; + ccl_global char *ray_state = kernel_split_state.ray_state; + ShaderData *sd = &kernel_split_state.sd[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer; + if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - throughput = throughput_coop[ray_index]; - state = &PathState_coop[ray_index]; - rng = &rng_coop[ray_index]; -#ifdef __WORK_STEALING__ - my_work = work_array[ray_index]; - sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample; - get_pixel_tile_position(&pixel_x, &pixel_y, + throughput = kernel_split_state.throughput[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + rng = &kernel_split_state.rng[ray_index]; + + work_index = kernel_split_state.work_array[ray_index]; + sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, - 
my_work, - sw, sh, sx, sy, - parallel_samples, + work_index, ray_index); - my_sample_tile = 0; -#else /* __WORK_STEALING__ */ - sample = work_array[ray_index]; - /* Buffer's stride is "stride"; Find x and y using ray_index. */ - int tile_index = ray_index / parallel_samples; - tile_x = tile_index % sw; - tile_y = tile_index / sw; - my_sample_tile = ray_index - (tile_index * parallel_samples); -#endif /* __WORK_STEALING__ */ - per_sample_output_buffers += - (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * - kernel_data.film.pass_stride; + + buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride; /* holdout */ #ifdef __HOLDOUT__ - if(((ccl_fetch(sd, flag) & SD_HOLDOUT) || - (ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK)) && + if(((sd->flag & SD_HOLDOUT) || + (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) { if(kernel_data.background.transparent) { float3 holdout_weight; - if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) { + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { holdout_weight = make_float3(1.0f, 1.0f, 1.0f); } else { holdout_weight = shader_holdout_eval(kg, sd); } /* any throughput is ok, should all be identical here */ - L_transparent_coop[ray_index] += average(holdout_weight*throughput); + kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput); } - if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) { + if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + enqueue_flag = 1; } } #endif /* __HOLDOUT__ */ } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - PathRadiance *L = &PathRadiance_coop[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; /* Holdout mask objects do not write data passes. 
*/ kernel_write_data_passes(kg, - per_sample_output_buffers, + buffer, L, sd, sample, @@ -183,12 +187,12 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( #ifdef __EMISSION__ /* emission */ - if(ccl_fetch(sd, flag) & SD_EMISSION) { + if(sd->flag & SD_EMISSION) { /* TODO(sergey): is isect.t wrong here for transparent surfaces? */ float3 emission = indirect_primitive_emission( kg, sd, - Intersection_coop[ray_index].t, + kernel_split_state.isect[ray_index].t, state->flag, state->ray_pdf); path_radiance_accum_emission(L, throughput, emission, state->bounce); @@ -203,7 +207,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(probability == 0.0f) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + enqueue_flag = 1; } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { @@ -211,10 +215,10 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE); if(terminate >= probability) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); - *enqueue_flag = 1; + enqueue_flag = 1; } else { - throughput_coop[ray_index] = throughput/probability; + kernel_split_state.throughput[ray_index] = throughput/probability; } } } @@ -224,7 +228,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || - (ccl_fetch(sd, flag) & SD_AO)) + (sd->flag & SD_AO)) { /* todo: solve correlation */ float bsdf_u, bsdf_v; @@ -232,29 +236,56 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( float ao_factor = kernel_data.background.ao_factor; float3 ao_N; - AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); - AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd); + kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); + 
kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd); float3 ao_D; float ao_pdf; sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf); - if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) { + if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) { Ray _ray; - _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng)); + _ray.P = ray_offset(sd->P, sd->Ng); _ray.D = ao_D; _ray.t = kernel_data.background.ao_distance; #ifdef __OBJECT_MOTION__ - _ray.time = ccl_fetch(sd, time); + _ray.time = sd->time; #endif - _ray.dP = ccl_fetch(sd, dP); + _ray.dP = sd->dP; _ray.dD = differential3_zero(); - AOLightRay_coop[ray_index] = _ray; + kernel_split_state.ao_light_ray[ray_index] = _ray; ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO); - *enqueue_flag_AO_SHADOW_RAY_CAST = 1; + enqueue_flag_AO_SHADOW_RAY_CAST = 1; } } } #endif /* __AO__ */ + +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics_bg, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + +#ifdef __AO__ + /* Enqueue to-shadow-ray-cast rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_SHADOW_RAY_CAST_AO_RAYS, + enqueue_flag_AO_SHADOW_RAY_CAST, + kernel_split_params.queue_size, + &local_queue_atomics_ao, + kernel_split_state.queue_data, + kernel_split_params.queue_index); +#endif } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h index 3bd0e361078..261625da31d 100644 --- a/intern/cycles/kernel/split/kernel_lamp_emission.h +++ b/intern/cycles/kernel/split/kernel_lamp_emission.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_lamp_emission * This is the 3rd kernel in the ray-tracing logic. 
This is the second of the @@ -36,28 +36,39 @@ * sw -------------------------------------------------| | * sh -------------------------------------------------| | */ -ccl_device void kernel_lamp_emission( - KernelGlobals *kg, - ccl_global float3 *throughput_coop, /* Required for lamp emission */ - PathRadiance *PathRadiance_coop, /* Required for lamp emission */ - ccl_global Ray *Ray_coop, /* Required for lamp emission */ - ccl_global PathState *PathState_coop, /* Required for lamp emission */ - Intersection *Intersection_coop, /* Required for lamp emission */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global char *use_queues_flag, /* Used to decide if this kernel should use - * queues to fetch ray index - */ - int ray_index) +ccl_device void kernel_lamp_emission(KernelGlobals *kg) { - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || - IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) + /* We will empty this queue in this kernel. */ + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0; + } + /* Fetch use_queues_flag. 
*/ + ccl_local char local_use_queues_flag; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_use_queues_flag = *kernel_split_params.use_queues_flag; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 1); + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) || + IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { - PathRadiance *L = &PathRadiance_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; + PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; - float3 throughput = throughput_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + float3 throughput = kernel_split_state.throughput[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; #ifdef __LAMP_MIS__ if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) { @@ -65,7 +76,7 @@ ccl_device void kernel_lamp_emission( Ray light_ray; light_ray.P = ray.P - state->ray_t*ray.D; - state->ray_t += Intersection_coop[ray_index].t; + state->ray_t += kernel_split_state.isect[ray_index].t; light_ray.D = ray.D; light_ray.t = state->ray_t; light_ray.time = ray.time; @@ -74,10 +85,13 @@ ccl_device void kernel_lamp_emission( /* intersect with lamp */ float3 emission; - if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) { + if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) { path_radiance_accum_emission(L, throughput, emission, state->bounce); } } #endif /* __LAMP_MIS__ */ } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h 
b/intern/cycles/kernel/split/kernel_next_iteration_setup.h index 816f3a6fbff..a6f26278116 100644 --- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h +++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_setup_next_iteration kernel. * This is the tenth kernel in the ray tracing logic. This is the ninth @@ -59,47 +59,76 @@ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays. * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays */ -ccl_device char kernel_next_iteration_setup( - KernelGlobals *kg, - ShaderData *sd, /* Required for setting up ray for next iteration */ - ccl_global uint *rng_coop, /* Required for setting up ray for next iteration */ - ccl_global float3 *throughput_coop, /* Required for setting up ray for next iteration */ - PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *Ray_coop, /* Required for setting up ray for next iteration */ - ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */ - ccl_global Ray *LightRay_dl_coop, /* Required for radiance update - direct lighting */ - ccl_global int *ISLamp_coop, /* Required for radiance update - direct lighting */ - ccl_global BsdfEval *BSDFEval_coop, /* Required for radiance update - direct lighting */ - ccl_global Ray *LightRay_ao_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOBSDF_coop, /* Required for radiance update - AO */ - ccl_global float3 *AOAlpha_coop, /* Required for radiance update - AO */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - ccl_global char *use_queues_flag, /* flag to decide if scene_intersect kernel should - * use queues to fetch ray index */ - int ray_index) +ccl_device void 
kernel_next_iteration_setup(KernelGlobals *kg) { + ccl_local unsigned int local_queue_atomics; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) { + /* If we are here, then it means that scene-intersect kernel + * has already been executed atleast once. From the next time, + * scene-intersect kernel may operate on queues to fetch ray index + */ + *kernel_split_params.use_queues_flag = 1; + + /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and + * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the + * previous kernel. + */ + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0; + kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0; + } + char enqueue_flag = 0; + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + +#ifdef __COMPUTE_DEVICE_GPU__ + /* If we are executing on a GPU device, we exit all threads that are not + * required. + * + * If we are executing on a CPU device, then we need to keep all threads + * active since we have barrier() calls later in the kernel. CPU devices, + * expect all threads to execute barrier statement. + */ + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } +#endif + +#ifndef __COMPUTE_DEVICE_GPU__ + if(ray_index != QUEUE_EMPTY_SLOT) { +#endif /* Load ShaderData structure. */ PathRadiance *L = NULL; ccl_global PathState *state = NULL; + ccl_global char *ray_state = kernel_split_state.ray_state; /* Path radiance update for AO/Direct_lighting's shadow blocked. 
*/ if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - state = &PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; - float3 _throughput = throughput_coop[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + L = &kernel_split_state.path_radiance[ray_index]; + float3 _throughput = kernel_split_state.throughput[ray_index]; if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - float3 shadow = LightRay_ao_coop[ray_index].P; - char update_path_radiance = LightRay_ao_coop[ray_index].t; + float3 shadow = kernel_split_state.ao_light_ray[ray_index].P; + // TODO(mai): investigate correctness here + char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t; if(update_path_radiance) { path_radiance_accum_ao(L, _throughput, - AOAlpha_coop[ray_index], - AOBSDF_coop[ray_index], + kernel_split_state.ao_alpha[ray_index], + kernel_split_state.ao_bsdf[ray_index], shadow, state->bounce); } @@ -107,35 +136,50 @@ ccl_device char kernel_next_iteration_setup( } if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) { - float3 shadow = LightRay_dl_coop[ray_index].P; - char update_path_radiance = LightRay_dl_coop[ray_index].t; + float3 shadow = kernel_split_state.light_ray[ray_index].P; + // TODO(mai): investigate correctness here + char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t; if(update_path_radiance) { - BsdfEval L_light = BSDFEval_coop[ray_index]; + BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index]; path_radiance_accum_light(L, _throughput, &L_light, shadow, 1.0f, state->bounce, - ISLamp_coop[ray_index]); + kernel_split_state.is_lamp[ray_index]); } REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL); } } if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - ccl_global float3 *throughput = &throughput_coop[ray_index]; - ccl_global Ray *ray = &Ray_coop[ray_index]; - ccl_global RNG *rng = &rng_coop[ray_index]; - state = 
&PathState_coop[ray_index]; - L = &PathRadiance_coop[ray_index]; + ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; + ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; + ccl_global RNG *rng = &kernel_split_state.rng[ray_index]; + state = &kernel_split_state.path_state[ray_index]; + L = &kernel_split_state.path_radiance[ray_index]; /* Compute direct lighting and next bounce. */ - if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) { + if(!kernel_path_surface_bounce(kg, rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) { ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER); enqueue_flag = 1; } } - return enqueue_flag; +#ifndef __COMPUTE_DEVICE_GPU__ + } +#endif + + /* Enqueue RAY_UPDATE_BUFFER rays. */ + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h new file mode 100644 index 00000000000..fe3c9e1e8a2 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -0,0 +1,100 @@ +/* + * Copyright 2011-2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* This kernel initializes structures needed in path-iteration kernels. 
+ * This is the first kernel in ray-tracing logic. + * + * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE + */ + +ccl_device void kernel_path_init(KernelGlobals *kg) { + int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0); + + /* This is the first assignment to ray_state; + * So we dont use ASSIGN_RAY_STATE macro. + */ + kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; + + unsigned int my_sample; + unsigned int pixel_x; + unsigned int pixel_y; + unsigned int tile_x; + unsigned int tile_y; + + unsigned int work_index = 0; + /* Get work. */ + if(!get_next_work(kg, &work_index, ray_index)) { + /* No more work, mark ray as inactive */ + kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; + + return; + } + + /* Get the sample associated with the work. */ + my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; + + /* Get pixel and tile position associated with the work. */ + get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, + &tile_x, &tile_y, + work_index, + ray_index); + kernel_split_state.work_array[ray_index] = work_index; + + ccl_global uint *rng_state = kernel_split_params.rng_state; + rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride; + + ccl_global float *buffer = kernel_split_params.buffer; + buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride; + + /* Initialize random numbers and ray. */ + kernel_path_trace_setup(kg, + rng_state, + my_sample, + pixel_x, pixel_y, + &kernel_split_state.rng[ray_index], + &kernel_split_state.ray[ray_index]); + + if(kernel_split_state.ray[ray_index].t != 0.0f) { + /* Initialize throughput, L_transparent, Ray, PathState; + * These rays proceed with path-iteration. 
+ */ + kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f); + kernel_split_state.L_transparent[ray_index] = 0.0f; + path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass); + path_state_init(kg, + &kernel_split_state.sd_DL_shadow[ray_index], + &kernel_split_state.path_state[ray_index], + &kernel_split_state.rng[ray_index], + my_sample, + &kernel_split_state.ray[ray_index]); +#ifdef __KERNEL_DEBUG__ + debug_data_init(&kernel_split_state.debug_data[ray_index]); +#endif + } + else { + /* These rays do not participate in path-iteration. */ + float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + /* Accumulate result in output buffer. */ + kernel_write_pass_float4(buffer, my_sample, L_rad); + path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]); + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h new file mode 100644 index 00000000000..66aad705bd4 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h @@ -0,0 +1,102 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +CCL_NAMESPACE_BEGIN + +/* + * The kernel "kernel_queue_enqueue" enqueues rays of + * different ray state into their appropriate Queues; + * 1. 
Rays that have been determined to hit the background from the + * "kernel_scene_intersect" kernel + * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS. + * + * The input and output of the kernel is as follows, + * + * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------| |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS) + * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---| | + * queuesize -------------------------------------------| | + * + * Note on Queues : + * State of queues during the first time this kernel is called : + * At entry, + * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays + * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays. + * + * State of queue during other times this kernel is called : + * At entry, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays. + * At exit, + * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays. + * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays. 
+ */ +ccl_device void kernel_queue_enqueue(KernelGlobals *kg) +{ + /* We have only 2 cases (Hit/Not-Hit) */ + ccl_local unsigned int local_queue_atomics[2]; + + int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0); + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + + if(lidx == 0) { + local_queue_atomics[0] = 0; + local_queue_atomics[1] = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int queue_number = -1; + + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) { + queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS; + } + else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS; + } + + unsigned int my_lqidx; + if(queue_number != -1) { + my_lqidx = get_local_queue_index(queue_number, local_queue_atomics); + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + if(lidx == 0) { + local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = + get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS, + local_queue_atomics, + kernel_split_params.queue_index); + local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = + get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + local_queue_atomics, + kernel_split_params.queue_index); + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + unsigned int my_gqidx; + if(queue_number != -1) { + my_gqidx = get_global_queue_index(queue_number, + kernel_split_params.queue_size, + my_lqidx, + local_queue_atomics); + kernel_split_state.queue_data[my_gqidx] = ray_index; + } +} + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h index 2388580051f..a7e0c7692a2 100644 --- a/intern/cycles/kernel/split/kernel_scene_intersect.h +++ b/intern/cycles/kernel/split/kernel_scene_intersect.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_scene_intersect kernel. 
* This is the second kernel in the ray tracing logic. This is the first @@ -61,34 +61,41 @@ * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change */ -ccl_device void kernel_scene_intersect( - KernelGlobals *kg, - ccl_global uint *rng_coop, - ccl_global Ray *Ray_coop, /* Required for scene_intersect */ - ccl_global PathState *PathState_coop, /* Required for scene_intersect */ - Intersection *Intersection_coop, /* Required for scene_intersect */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int sw, int sh, - ccl_global char *use_queues_flag, /* used to decide if this kernel should use - * queues to fetch ray index */ -#ifdef __KERNEL_DEBUG__ - DebugData *debugdata_coop, -#endif - int ray_index) +ccl_device void kernel_scene_intersect(KernelGlobals *kg) { + /* Fetch use_queues_flag */ + ccl_local char local_use_queues_flag; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_use_queues_flag = *kernel_split_params.use_queues_flag; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(local_use_queues_flag) { + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + } + /* All regenerated rays become active here */ - if(IS_STATE(ray_state, ray_index, RAY_REGENERATED)) - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE); + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE); - if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE)) + if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) return; #ifdef __KERNEL_DEBUG__ - DebugData *debug_data = &debugdata_coop[ray_index]; + DebugData *debug_data = &kernel_split_state.debug_data[ray_index]; #endif - Intersection *isect = &Intersection_coop[ray_index]; - PathState state = 
PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + Intersection *isect = &kernel_split_state.isect[ray_index]; + PathState state = kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; /* intersect scene */ uint visibility = path_state_ray_visibility(kg, &state); @@ -96,7 +103,7 @@ ccl_device void kernel_scene_intersect( #ifdef __HAIR__ float difl = 0.0f, extmax = 0.0f; uint lcg_state = 0; - RNG rng = rng_coop[ray_index]; + RNG rng = kernel_split_state.rng[ray_index]; if(kernel_data.bvh.have_curves) { if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) { @@ -128,6 +135,9 @@ ccl_device void kernel_scene_intersect( * These rays undergo special processing in the * background_bufferUpdate kernel. */ - ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND); + ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND); } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index cef64bf5f36..35ee19ddf1b 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_shader_eval kernel * This kernel is the 5th kernel in the ray tracing logic. 
This is @@ -44,27 +44,51 @@ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays */ -ccl_device void kernel_shader_eval( - KernelGlobals *kg, - ShaderData *sd, /* Output ShaderData structure to be filled */ - ccl_global uint *rng_coop, /* Required for rbsdf calculation */ - ccl_global Ray *Ray_coop, /* Required for setting up shader from ray */ - ccl_global PathState *PathState_coop, /* Required for all functions in this kernel */ - Intersection *Intersection_coop, /* Required for setting up shader from ray */ - ccl_global char *ray_state, /* Denotes the state of each ray */ - int ray_index) + +ccl_device void kernel_shader_eval(KernelGlobals *kg) { - if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - Intersection *isect = &Intersection_coop[ray_index]; - ccl_global uint *rng = &rng_coop[ray_index]; - ccl_global PathState *state = &PathState_coop[ray_index]; - Ray ray = Ray_coop[ray_index]; + /* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */ + ccl_local unsigned int local_queue_atomics; + if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) { + local_queue_atomics = 0; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + ray_index = get_ray_index(kg, ray_index, + QUEUE_ACTIVE_AND_REGENERATED_RAYS, + kernel_split_state.queue_data, + kernel_split_params.queue_size, + 0); + + if(ray_index == QUEUE_EMPTY_SLOT) { + return; + } + + char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0; + enqueue_ray_index_local(ray_index, + QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, + enqueue_flag, + kernel_split_params.queue_size, + &local_queue_atomics, + kernel_split_state.queue_data, + kernel_split_params.queue_index); + + /* Continue on with shader evaluation. 
*/ + if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) { + Intersection *isect = &kernel_split_state.isect[ray_index]; + ccl_global uint *rng = &kernel_split_state.rng[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + Ray ray = kernel_split_state.ray[ray_index]; shader_setup_from_ray(kg, - sd, + &kernel_split_state.sd[ray_index], isect, &ray); float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF); - shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN); } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h index 6153af47f96..d532c7cf55b 100644 --- a/intern/cycles/kernel/split/kernel_shadow_blocked.h +++ b/intern/cycles/kernel/split/kernel_shadow_blocked.h @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "kernel_split_common.h" +CCL_NAMESPACE_BEGIN /* Note on kernel_shadow_blocked kernel. * This is the ninth kernel in the ray tracing logic. This is the eighth @@ -45,24 +45,47 @@ * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry. * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit. 
*/ -ccl_device void kernel_shadow_blocked( - KernelGlobals *kg, - ccl_global PathState *PathState_coop, /* Required for shadow blocked */ - ccl_global Ray *LightRay_dl_coop, /* Required for direct lighting's shadow blocked */ - ccl_global Ray *LightRay_ao_coop, /* Required for AO's shadow blocked */ - ccl_global char *ray_state, - char shadow_blocked_type, - int ray_index) +ccl_device void kernel_shadow_blocked(KernelGlobals *kg) { + int lidx = ccl_local_id(1) * ccl_local_id(0) + ccl_local_id(0); + + ccl_local unsigned int ao_queue_length; + ccl_local unsigned int dl_queue_length; + if(lidx == 0) { + ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS]; + dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS]; + } + ccl_barrier(CCL_LOCAL_MEM_FENCE); + + /* flag determining if the current ray is to process shadow ray for AO or DL */ + char shadow_blocked_type = -1; + + int ray_index = QUEUE_EMPTY_SLOT; + int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0); + if(thread_index < ao_queue_length + dl_queue_length) { + if(thread_index < ao_queue_length) { + ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO; + } else { + ray_index = get_ray_index(kg, thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, + kernel_split_state.queue_data, kernel_split_params.queue_size, 1); + shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL; + } + } + + if(ray_index == QUEUE_EMPTY_SLOT) + return; + /* Flag determining if we need to update L. 
*/ char update_path_radiance = 0; - if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || - IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) + if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) || + IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) { - ccl_global PathState *state = &PathState_coop[ray_index]; - ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index]; - ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index]; + ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; + ccl_global Ray *light_ray_dl_global = &kernel_split_state.light_ray[ray_index]; + ccl_global Ray *light_ray_ao_global = &kernel_split_state.ao_light_ray[ray_index]; ccl_global Ray *light_ray_global = shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO @@ -71,7 +94,7 @@ ccl_device void kernel_shadow_blocked( float3 shadow; update_path_radiance = !(shadow_blocked(kg, - kg->sd_input, + &kernel_split_state.sd_DL_shadow[thread_index], state, light_ray_global, &shadow)); @@ -83,3 +106,6 @@ ccl_device void kernel_shadow_blocked( light_ray_global->t = update_path_radiance; } } + +CCL_NAMESPACE_END + diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h index 2135ee22b2e..dd0c3f9c941 100644 --- a/intern/cycles/kernel/split/kernel_split_common.h +++ b/intern/cycles/kernel/split/kernel_split_common.h @@ -17,11 +17,23 @@ #ifndef __KERNEL_SPLIT_H__ #define __KERNEL_SPLIT_H__ -#include "kernel_compat_opencl.h" #include "kernel_math.h" #include "kernel_types.h" + +#include "kernel_split_data.h" + #include "kernel_globals.h" -#include "kernel_image_opencl.h" + +#ifdef __OSL__ +# include "osl_shader.h" +#endif + +#ifdef __KERNEL_OPENCL__ +# include "kernel_image_opencl.h" +#endif +#ifdef __KERNEL_CPU__ +# include "../kernels/cpu/kernel_cpu_image.h" +#endif #include "util_atomic.h" diff --git a/intern/cycles/kernel/split/kernel_split_data.h 
b/intern/cycles/kernel/split/kernel_split_data.h new file mode 100644 index 00000000000..5380c0c5de6 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __KERNEL_SPLIT_DATA_H__ +#define __KERNEL_SPLIT_DATA_H__ + +#include "kernel_split_data_types.h" +#include "kernel_globals.h" + +CCL_NAMESPACE_BEGIN + +ccl_device_inline size_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements) +{ + (void)kg; /* Unused on CPU. */ + + size_t size = 0; +#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16) + size = size SPLIT_DATA_ENTRIES; +#undef SPLIT_DATA_ENTRY + + return size; +} + +ccl_device_inline void split_data_init(KernelGlobals *kg, + ccl_global SplitData *split_data, + size_t num_elements, + ccl_global void *data, + ccl_global char *ray_state) +{ + (void)kg; /* Unused on CPU. 
*/ + + ccl_global char *p = (ccl_global char*)data; + +#define SPLIT_DATA_ENTRY(type, name, num) \ + split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16); + SPLIT_DATA_ENTRIES +#undef SPLIT_DATA_ENTRY + + split_data->ray_state = ray_state; +} + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_H__ */ diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h new file mode 100644 index 00000000000..62e3ea45ae2 --- /dev/null +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -0,0 +1,109 @@ +/* + * Copyright 2011-2016 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __KERNEL_SPLIT_DATA_TYPES_H__ +#define __KERNEL_SPLIT_DATA_TYPES_H__ + +CCL_NAMESPACE_BEGIN + +/* parameters used by the split kernels, we use a single struct to avoid passing these to each kernel */ + +typedef struct SplitParams { + int x; + int y; + int w; + int h; + + int offset; + int stride; + + ccl_global uint *rng_state; + + int start_sample; + int end_sample; + + ccl_global unsigned int *work_pools; + unsigned int num_samples; + + ccl_global int *queue_index; + int queue_size; + ccl_global char *use_queues_flag; + + ccl_global float *buffer; +} SplitParams; + +/* Global memory variables [porting]; These memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. + */ + +/* SPLIT_DATA_ENTRY(type, name, num) */ + +#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__) +/* DebugData memory */ +# define SPLIT_DATA_DEBUG_ENTRIES \ + SPLIT_DATA_ENTRY(DebugData, debug_data, 1) +#else +# define SPLIT_DATA_DEBUG_ENTRIES +#endif + +#define SPLIT_DATA_ENTRIES \ + SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \ + SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \ + SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \ + SPLIT_DATA_ENTRY(Intersection, isect, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \ + SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \ + SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \ + SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ + SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ + SPLIT_DATA_ENTRY(Intersection, isect_shadow, 2) \ + SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? 
*/ \ + SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ + SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 2) \ + SPLIT_DATA_DEBUG_ENTRIES \ + +/* struct that holds pointers to data in the shared state buffer */ +typedef struct SplitData { +#define SPLIT_DATA_ENTRY(type, name, num) type *name; + SPLIT_DATA_ENTRIES +#undef SPLIT_DATA_ENTRY + + /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from + * the host easily) but is still used the same as the other data so we have it here in this struct as well + */ + ccl_global char *ray_state; +} SplitData; + +#ifndef __KERNEL_CUDA__ +# define kernel_split_state (kg->split_data) +# define kernel_split_params (kg->split_param_data) +#else +__device__ SplitData __split_data; +# define kernel_split_state (__split_data) +__device__ SplitParams __split_param_data; +# define kernel_split_params (__split_param_data) +#endif /* __KERNEL_CUDA__ */ + +CCL_NAMESPACE_END + +#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */ diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h deleted file mode 100644 index a21e9b6a0b1..00000000000 --- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "../kernel_compat_opencl.h" -#include "../kernel_math.h" -#include "../kernel_types.h" -#include "../kernel_globals.h" - -/* Since we process various samples in parallel; The output radiance of different samples - * are stored in different locations; This kernel combines the output radiance contributed - * by all different samples and stores them in the RenderTile's output buffer. - */ -ccl_device void kernel_sum_all_radiance( - ccl_constant KernelData *data, /* To get pass_stride to offet into buffer */ - ccl_global float *buffer, /* Output buffer of RenderTile */ - ccl_global float *per_sample_output_buffer, /* Radiance contributed by all samples */ - int parallel_samples, int sw, int sh, int stride, - int buffer_offset_x, - int buffer_offset_y, - int buffer_stride, - int start_sample) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if(x < sw && y < sh) { - buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride); - per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride); - - int sample_stride = (data->film.pass_stride); - - int sample_iterator = 0; - int pass_stride_iterator = 0; - int num_floats = data->film.pass_stride; - - for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) { - for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) { - *(buffer + pass_stride_iterator) = - (start_sample == 0 && sample_iterator == 0) - ? 
*(per_sample_output_buffer + pass_stride_iterator) - : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator); - } - per_sample_output_buffer += sample_stride; - } - } -} diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 88ec7fe6fcc..57ec9f94a3d 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -192,7 +192,7 @@ CCL_NAMESPACE_BEGIN ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag) { float stack[SVM_STACK_SIZE]; - int offset = ccl_fetch(sd, shader) & SHADER_MASK; + int offset = sd->shader & SHADER_MASK; while(1) { uint4 node = read_node(kg, &offset); diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h index 0e55c99ae97..229a3f20421 100644 --- a/intern/cycles/kernel/svm/svm_attribute.h +++ b/intern/cycles/kernel/svm/svm_attribute.h @@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData AttributeDescriptor desc; - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { desc = find_attribute(kg, sd, node.y); if(desc.offset == ATTR_STD_NOT_FOUND) { desc = attribute_not_found(); diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h index 04a8c7b64e5..610d9af9e1f 100644 --- a/intern/cycles/kernel/svm/svm_bump.h +++ b/intern/cycles/kernel/svm/svm_bump.h @@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* save state */ - stack_store_float3(stack, offset+0, ccl_fetch(sd, P)); - stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx); - stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy); + stack_store_float3(stack, offset+0, sd->P); + stack_store_float3(stack, offset+3, sd->dP.dx); + stack_store_float3(stack, offset+6, sd->dP.dy); /* set state as if 
undisplaced */ const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED); @@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa object_dir_transform(kg, sd, &dPdx); object_dir_transform(kg, sd, &dPdy); - ccl_fetch(sd, P) = P; - ccl_fetch(sd, dP).dx = dPdx; - ccl_fetch(sd, dP).dy = dPdy; + sd->P = P; + sd->dP.dx = dPdx; + sd->dP.dy = dPdy; } } ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset) { /* restore state */ - ccl_fetch(sd, P) = stack_load_float3(stack, offset+0); - ccl_fetch(sd, dP).dx = stack_load_float3(stack, offset+3); - ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6); + sd->P = stack_load_float3(stack, offset+0); + sd->dP.dx = stack_load_float3(stack, offset+3); + sd->dP.dy = stack_load_float3(stack, offset+6); } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h index 00678a49d70..90249dfd978 100644 --- a/intern/cycles/kernel/svm/svm_camera.h +++ b/intern/cycles/kernel/svm/svm_camera.h @@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack, float3 vector; Transform tfm = kernel_data.cam.worldtocamera; - vector = transform_point(&tfm, ccl_fetch(sd, P)); + vector = transform_point(&tfm, sd->P); zdepth = vector.z; distance = len(vector); diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h index 017d697f9f8..1885e1af851 100644 --- a/intern/cycles/kernel/svm/svm_closure.h +++ b/intern/cycles/kernel/svm/svm_closure.h @@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { bsdf->alpha_y = 0.0f; bsdf->alpha_x = 0.0f; bsdf->ior = 0.0f; - ccl_fetch(sd, flag) |= 
bsdf_reflection_setup(bsdf); + sd->flag |= bsdf_reflection_setup(bsdf); } } else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) { @@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); } else { bsdf->alpha_x = roughness; @@ -50,9 +50,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t bsdf->ior = eta; if(refract) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); } } @@ -70,14 +70,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(mix_weight == 0.0f) return; - float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N); + float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N; float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z); float param2 = (stack_valid(param2_offset))? 
stack_load_float(stack, param2_offset): __uint_as_float(node.w); switch(type) { case CLOSURE_BSDF_DIFFUSE_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight); if(bsdf) { @@ -86,31 +86,31 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * float roughness = param1; if(roughness == 0.0f) { - ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); + sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf); } else { bsdf->roughness = roughness; - ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf); + sd->flag |= bsdf_oren_nayar_setup(bsdf); } } break; } case CLOSURE_BSDF_TRANSLUCENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight); if(bsdf) { bsdf->N = N; - ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf); + sd->flag |= bsdf_translucent_setup(bsdf); } break; } case CLOSURE_BSDF_TRANSPARENT_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { - ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); + sd->flag |= bsdf_transparent_setup(bsdf); } break; } @@ -123,7 +123,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -135,21 +135,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * /* setup bsdf 
*/ if(type == CLOSURE_BSDF_REFLECTION_ID) - ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf); + sd->flag |= bsdf_reflection_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_setup(bsdf); else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) { kernel_assert(stack_valid(data_node.z)); bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); if(bsdf->extra) { bsdf->extra->color = stack_load_float3(stack, data_node.z); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf); } } else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf); + sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf); } break; @@ -161,7 +161,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -169,7 +169,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->extra = NULL; float eta = fmaxf(param2, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; /* setup bsdf */ if(type == CLOSURE_BSDF_REFRACTION_ID) { @@ -177,7 +177,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->alpha_y = 0.0f; bsdf->ior = eta; - ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf); + sd->flag |= bsdf_refraction_setup(bsdf); } else { bsdf->alpha_x = param1; @@ -185,9 +185,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = eta; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID) - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf); } } @@ -203,14 +203,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * break; } #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; /* index of refraction */ float eta = fmaxf(param2, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; /* fresnel */ - float cosNO = dot(N, ccl_fetch(sd, I)); + float cosNO = dot(N, sd->I); float fresnel = fresnel_dielectric_cos(cosNO, eta); float roughness = param1; @@ -249,7 +249,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); @@ -261,13 +261,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->alpha_x = param1; bsdf->alpha_y = param1; float eta = fmaxf(param2, 1e-5f); - bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + bsdf->ior = (sd->flag & SD_BACKFACING)? 
1.0f/eta: eta; kernel_assert(stack_valid(data_node.z)); bsdf->extra->color = stack_load_float3(stack, data_node.z); /* setup bsdf */ - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf); } break; @@ -280,7 +280,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE)) break; #endif - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight); if(bsdf) { @@ -310,33 +310,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->ior = 0.0f; if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) { - ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf); } else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) { kernel_assert(stack_valid(data_node.w)); bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra)); if(bsdf->extra) { bsdf->extra->color = stack_load_float3(stack, data_node.w); - ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); + sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf); } } else - ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); + sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf); } break; } case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight); if(bsdf) { bsdf->N = N; bsdf->sigma = saturate(param1); - 
ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf); + sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf); } break; } @@ -346,7 +346,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * break; #endif case CLOSURE_BSDF_DIFFUSE_TOON_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight); if(bsdf) { @@ -355,18 +355,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bsdf->smooth = param2; if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID) - ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf); + sd->flag |= bsdf_diffuse_toon_setup(bsdf); else - ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf); + sd->flag |= bsdf_glossy_toon_setup(bsdf); } break; } #ifdef __HAIR__ case CLOSURE_BSDF_HAIR_REFLECTION_ID: case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 weight = sd->svm_closure_weight * mix_weight; - if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) { + if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) { ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight); if(bsdf) { @@ -376,7 +376,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * * better figure out a way to skip backfaces from rays * spawned by transmission from the front */ bsdf->weight = make_float3(1.0f, 1.0f, 1.0f); - ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf); + sd->flag |= bsdf_transparent_setup(bsdf); } } else { @@ -390,18 +390,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * if(stack_valid(data_node.y)) { bsdf->T = normalize(stack_load_float3(stack, data_node.y)); } - else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) { - bsdf->T = normalize(ccl_fetch(sd, dPdv)); + else if(!(sd->type & 
PRIMITIVE_ALL_CURVE)) { + bsdf->T = normalize(sd->dPdv); bsdf->offset = 0.0f; } else - bsdf->T = normalize(ccl_fetch(sd, dPdu)); + bsdf->T = normalize(sd->dPdu); if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) { - ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf); + sd->flag |= bsdf_hair_reflection_setup(bsdf); } else { - ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf); + sd->flag |= bsdf_hair_transmission_setup(bsdf); } } } @@ -414,8 +414,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * case CLOSURE_BSSRDF_CUBIC_ID: case CLOSURE_BSSRDF_GAUSSIAN_ID: case CLOSURE_BSSRDF_BURLEY_ID: { - float3 albedo = ccl_fetch(sd, svm_closure_weight); - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight; + float3 albedo = sd->svm_closure_weight; + float3 weight = sd->svm_closure_weight * mix_weight; float sample_weight = fabsf(average(weight)); /* disable in case of diffuse ancestor, can't see it well then and @@ -441,7 +441,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.x; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f)); @@ -452,7 +452,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.y; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z)); @@ -463,7 +463,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float * bssrdf->albedo = albedo.z; bssrdf->sharpness = sharpness; bssrdf->N = N; - ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type); + sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type); } } @@ -493,21 
+493,21 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float switch(type) { case CLOSURE_VOLUME_ABSORPTION_ID: { - float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density; + float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sd->svm_closure_weight) * mix_weight * density; ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight); if(sc) { - ccl_fetch(sd, flag) |= volume_absorption_setup(sc); + sd->flag |= volume_absorption_setup(sc); } break; } case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: { - float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density; + float3 weight = sd->svm_closure_weight * mix_weight * density; HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight); if(volume) { volume->g = param2; /* g */ - ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume); + sd->flag |= volume_henyey_greenstein_setup(volume); } break; } @@ -527,12 +527,12 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_EMISSION; + sd->flag |= SD_EMISSION; } ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node) @@ -545,10 +545,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, 
sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight); } ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node) @@ -561,12 +561,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_HOLDOUT; + sd->flag |= SD_HOLDOUT; } ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node) @@ -579,19 +579,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, if(mix_weight == 0.0f) return; - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight); } else - closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight)); + closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight); - ccl_fetch(sd, flag) |= SD_AO; + sd->flag |= SD_AO; } /* Closure Nodes */ ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight) { - ccl_fetch(sd, svm_closure_weight) = weight; + sd->svm_closure_weight = weight; } ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b) @@ -641,7 +641,7 @@ 
ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node) ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal) { float3 normal = stack_load_float3(stack, in_direction); - ccl_fetch(sd, N) = normal; + sd->N = normal; stack_store_float3(stack, out_normal, normal); } diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h index 890ab41aaaa..c94fa130af7 100644 --- a/intern/cycles/kernel/svm/svm_displace.h +++ b/intern/cycles/kernel/svm/svm_displace.h @@ -25,10 +25,10 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac uint normal_offset, distance_offset, invert, use_object_space; decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; - float3 dPdx = ccl_fetch(sd, dP).dx; - float3 dPdy = ccl_fetch(sd, dP).dy; + float3 dPdx = sd->dP.dx; + float3 dPdy = sd->dP.dy; if(use_object_space) { object_inverse_normal_transform(kg, sd, &normal_in); @@ -80,14 +80,14 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, flo { float d = stack_load_float(stack, fac_offset); - float3 dP = ccl_fetch(sd, N); + float3 dP = sd->N; object_inverse_normal_transform(kg, sd, &dP); dP *= d*0.1f; /* todo: get rid of this factor */ object_dir_transform(kg, sd, &dP); - ccl_fetch(sd, P) += dP; + sd->P += dP; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h index 23c97d80cb0..3703ec55015 100644 --- a/intern/cycles/kernel/svm/svm_fresnel.h +++ b/intern/cycles/kernel/svm/svm_fresnel.h @@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset, uint normal_offset, out_offset; 
decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL); float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value); - float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N; eta = fmaxf(eta, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta; + eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta; - float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); stack_store_float(stack, out_offset, f); } @@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node) decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL); float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value); - float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N); + float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N; float f; if(type == NODE_LAYER_WEIGHT_FRESNEL) { float eta = fmaxf(1.0f - blend, 1e-5f); - eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta; + eta = (sd->flag & SD_BACKFACING)? 
eta: 1.0f/eta; - f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta); + f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta); } else { - f = fabsf(dot(ccl_fetch(sd, I), normal_in)); + f = fabsf(dot(sd->I, normal_in)); if(blend != 0.5f) { blend = clamp(blend, 0.0f, 1.0f-1e-5f); diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 7d512f7ff4d..4a09d9f6653 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -27,15 +27,15 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg, float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P); break; - case NODE_GEOM_N: data = ccl_fetch(sd, N); break; + case NODE_GEOM_P: data = sd->P; break; + case NODE_GEOM_N: data = sd->N; break; #ifdef __DPDU__ case NODE_GEOM_T: data = primitive_tangent(kg, sd); break; #endif - case NODE_GEOM_I: data = ccl_fetch(sd, I); break; - case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break; + case NODE_GEOM_I: data = sd->I; break; + case NODE_GEOM_Ng: data = sd->Ng; break; #ifdef __UV__ - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break; + case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break; #endif } @@ -48,8 +48,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dx; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -65,8 +65,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo float3 data; switch(type) { - case NODE_GEOM_P: data = ccl_fetch(sd, P) + 
ccl_fetch(sd, dP).dy; break; - case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break; + case NODE_GEOM_P: data = sd->P + sd->dP.dy; break; + case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break; default: svm_node_geometry(kg, sd, stack, type, out_offset); return; } @@ -87,9 +87,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s stack_store_float3(stack, out_offset, object_location(kg, sd)); return; } - case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break; case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break; - case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break; + case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break; default: data = 0.0f; break; } @@ -106,44 +106,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg, { switch(type) { case NODE_INFO_PAR_INDEX: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_index(kg, particle_id)); break; } case NODE_INFO_PAR_AGE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_age(kg, particle_id)); break; } case NODE_INFO_PAR_LIFETIME: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id)); break; } case NODE_INFO_PAR_LOCATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_location(kg, particle_id)); 
break; } #if 0 /* XXX float4 currently not supported in SVM stack */ case NODE_INFO_PAR_ROTATION: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id)); break; } #endif case NODE_INFO_PAR_SIZE: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float(stack, out_offset, particle_size(kg, particle_id)); break; } case NODE_INFO_PAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id)); break; } case NODE_INFO_PAR_ANGULAR_VELOCITY: { - int particle_id = object_particle_id(kg, ccl_fetch(sd, object)); + int particle_id = object_particle_id(kg, sd->object); stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id)); break; } @@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, switch(type) { case NODE_INFO_CURVE_IS_STRAND: { - data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0; + data = (sd->type & PRIMITIVE_ALL_CURVE) != 0; stack_store_float(stack, out_offset, data); break; } @@ -177,7 +177,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg, break; } /*case NODE_INFO_CURVE_FADE: { - data = ccl_fetch(sd, curve_transparency); + data = sd->curve_transparency; stack_store_float(stack, out_offset, data); break; }*/ diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h index 2afdf61b476..76acc9253a1 100644 --- a/intern/cycles/kernel/svm/svm_image.h +++ b/intern/cycles/kernel/svm/svm_image.h @@ -144,7 +144,6 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break; case 87: r = 
kernel_tex_image_interp(__tex_image_byte4_087, x, y); break; case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break; - case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break; default: kernel_assert(0); return make_float4(0.0f, 0.0f, 0.0f, 0.0f); @@ -238,9 +237,9 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node) { /* get object space normal */ - float3 N = ccl_fetch(sd, N); + float3 N = sd->N; - N = ccl_fetch(sd, N); + N = sd->N; object_inverse_normal_transform(kg, sd, &N); /* project from direction vector to barycentric coordinates in triangles */ diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h index 04f6f623f18..1492e358608 100644 --- a/intern/cycles/kernel/svm/svm_light_path.h +++ b/intern/cycles/kernel/svm/svm_light_path.h @@ -31,8 +31,8 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break; case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break; case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break; - case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break; - case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break; + case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 
1.0f: 0.0f; break; + case NODE_LP_ray_length: info = sd->ray_length; break; case NODE_LP_ray_depth: info = (float)state->bounce; break; case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break; case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break; @@ -56,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node) switch(type) { case NODE_LIGHT_FALLOFF_QUADRATIC: break; - case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break; - case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break; + case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break; + case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break; } float smooth = stack_load_float(stack, smooth_offset); if(smooth > 0.0f) { - float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); + float squared = sd->ray_length*sd->ray_length; /* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. 
*/ if(isfinite(squared)) { strength *= squared/(smooth + squared); diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h index 62ff38cf1c5..0347ab7b193 100644 --- a/intern/cycles/kernel/svm/svm_noisetex.h +++ b/intern/cycles/kernel/svm/svm_noisetex.h @@ -18,50 +18,42 @@ CCL_NAMESPACE_BEGIN /* Noise */ -ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color) -{ - int hard = 0; - - if(distortion != 0.0f) { - float3 r, offset = make_float3(13.5f, 13.5f, 13.5f); - - r.x = noise(p + offset) * distortion; - r.y = noise(p) * distortion; - r.z = noise(p - offset) * distortion; - - p += r; - } - - *fac = noise_turbulence(p, detail, hard); - *color = make_float3(*fac, - noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard), - noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard)); -} - ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset) { uint co_offset, scale_offset, detail_offset, distortion_offset, fac_offset, color_offset; decode_node_uchar4(node.y, &co_offset, &scale_offset, &detail_offset, &distortion_offset); + decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL); uint4 node2 = read_node(kg, offset); float scale = stack_load_float_default(stack, scale_offset, node2.x); float detail = stack_load_float_default(stack, detail_offset, node2.y); float distortion = stack_load_float_default(stack, distortion_offset, node2.z); - float3 co = stack_load_float3(stack, co_offset); + float3 p = stack_load_float3(stack, co_offset) * scale; + int hard = 0; - float3 color; - float f; + if(distortion != 0.0f) { + float3 r, offset = make_float3(13.5f, 13.5f, 13.5f); + + r.x = noise(p + offset) * distortion; + r.y = noise(p) * distortion; + r.z = noise(p - offset) * distortion; - svm_noise(co*scale, detail, distortion, &f, &color); + p += r; + } - decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL); + float 
f = noise_turbulence(p, detail, hard); - if(stack_valid(fac_offset)) + if(stack_valid(fac_offset)) { stack_store_float(stack, fac_offset, f); - if(stack_valid(color_offset)) + } + if(stack_valid(color_offset)) { + float3 color = make_float3(f, + noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard), + noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard)); stack_store_float3(stack, color_offset, color); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h index c0b01262212..c94327401f5 100644 --- a/intern/cycles/kernel/svm/svm_tex_coord.h +++ b/intern/cycles/kernel/svm/svm_tex_coord.h @@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P); + data = sd->P; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -48,47 +48,47 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P)); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P); else - data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg)); + data = transform_point(&tfm, sd->P + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P)); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, 
P)); + data = camera_world_to_ndc(kg, sd, sd->P); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P); + data = sd->P; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -112,9 +112,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -129,47 +129,47 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dx); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, 
object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; + data = sd->P + sd->dP.dx; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -196,9 +196,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, switch(type) { case NODE_TEXCO_OBJECT: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; if(node.w == 0) { - if(ccl_fetch(sd, object) != OBJECT_NONE) { + if(sd->object != OBJECT_NONE) { object_inverse_position_transform(kg, sd, &data); } } @@ -213,47 +213,47 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg, break; } case NODE_TEXCO_NORMAL: { - data = ccl_fetch(sd, N); + data = sd->N; object_inverse_normal_transform(kg, sd, &data); break; } case NODE_TEXCO_CAMERA: { Transform tfm = kernel_data.cam.worldtocamera; 
- if(ccl_fetch(sd, object) != OBJECT_NONE) - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + if(sd->object != OBJECT_NONE) + data = transform_point(&tfm, sd->P + sd->dP.dy); else - data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg)); + data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg)); break; } case NODE_TEXCO_WINDOW: { - if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy); + if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC) + data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy); else - data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy); + data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy); data.z = 0.0f; break; } case NODE_TEXCO_REFLECTION: { - if(ccl_fetch(sd, object) != OBJECT_NONE) - data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I); + if(sd->object != OBJECT_NONE) + data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I; else - data = ccl_fetch(sd, I); + data = sd->I; break; } case NODE_TEXCO_DUPLI_GENERATED: { - data = object_dupli_generated(kg, ccl_fetch(sd, object)); + data = object_dupli_generated(kg, sd->object); break; } case NODE_TEXCO_DUPLI_UV: { - data = object_dupli_uv(kg, ccl_fetch(sd, object)); + data = object_dupli_uv(kg, sd->object); break; } case NODE_TEXCO_VOLUME_GENERATED: { - data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; + data = sd->P + sd->dP.dy; #ifdef __VOLUME__ - if(ccl_fetch(sd, object) != OBJECT_NONE) + if(sd->object != OBJECT_NONE) data = volume_normalized_position(kg, sd, data); #endif break; @@ -274,12 +274,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float3 color = stack_load_float3(stack, color_offset); color = 
2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f); - bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0; + bool is_backfacing = (sd->flag & SD_BACKFACING) != 0; float3 N; if(space == NODE_NORMAL_MAP_TANGENT) { /* tangent space */ - if(ccl_fetch(sd, object) == OBJECT_NONE) { + if(sd->object == OBJECT_NONE) { stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f)); return; } @@ -299,11 +299,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL); float3 normal; - if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) { + if(sd->shader & SHADER_SMOOTH_NORMAL) { normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL); } else { - normal = ccl_fetch(sd, Ng); + normal = sd->Ng; /* the normal is already inverted, which is too soon for the math here */ if(is_backfacing) { @@ -345,11 +345,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st if(strength != 1.0f) { strength = max(strength, 0.0f); - N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength); + N = safe_normalize(sd->N + (N - sd->N)*strength); } if(is_zero(N)) { - N = ccl_fetch(sd, N); + N = sd->N; } stack_store_float3(stack, normal_offset, N); @@ -377,7 +377,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack float3 generated; if(desc.offset == ATTR_STD_NOT_FOUND) - generated = ccl_fetch(sd, P); + generated = sd->P; else generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL); @@ -390,7 +390,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack } object_normal_transform(kg, sd, &tangent); - tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N)))); + tangent = cross(sd->N, normalize(cross(tangent, sd->N))); stack_store_float3(stack, tangent_offset, tangent); } diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h 
b/intern/cycles/kernel/svm/svm_vector_transform.h index 4c32130d06d..4e92f27acdb 100644 --- a/intern/cycles/kernel/svm/svm_vector_transform.h +++ b/intern/cycles/kernel/svm/svm_vector_transform.h @@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito; Transform tfm; - bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE); + bool is_object = (sd->object != OBJECT_NONE); bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL); /* From world */ diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h index 87e40791333..3c6353c8001 100644 --- a/intern/cycles/kernel/svm/svm_wireframe.h +++ b/intern/cycles/kernel/svm/svm_wireframe.h @@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg, float3 *P) { #ifdef __HAIR__ - if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) + if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE) #else - if(ccl_fetch(sd, prim) != PRIM_NONE) + if(sd->prim != PRIM_NONE) #endif { float3 Co[3]; @@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg, /* Triangles */ int np = 3; - if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) - triangle_vertices(kg, ccl_fetch(sd, prim), Co); + if(sd->type & PRIMITIVE_TRIANGLE) + triangle_vertices(kg, sd->prim, Co); else - motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co); + motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co); - if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) { + if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) { object_position_transform(kg, sd, &Co[0]); object_position_transform(kg, sd, &Co[1]); object_position_transform(kg, sd, &Co[2]); @@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg, if(pixel_size) { // Project the 
derivatives of P to the viewing plane defined // by I so we have a measure of how big is a pixel at this point - float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); - float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I)); + float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I); + float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I); // Take the average of both axis' length pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f; } @@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, * With OpenCL 2.0 it's possible to avoid this change, but for until * then we'll be living with such an exception. */ - float3 P = ccl_fetch(sd, P); + float3 P = sd->P; float f = wireframe(kg, sd, size, pixel_size, &P); #else - float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P)); + float f = wireframe(kg, sd, size, pixel_size, &sd->P); #endif /* TODO(sergey): Think of faster way to calculate derivatives. 
*/ if(bump_offset == NODE_BUMP_OFFSET_DX) { - float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx; - f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx); + float3 Px = sd->P - sd->dP.dx; + f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx); } else if(bump_offset == NODE_BUMP_OFFSET_DY) { - float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy; - f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy); + float3 Py = sd->P - sd->dP.dy; + f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy); } if(stack_valid(out_fac)) diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h index 8029c6a9e80..deb22c9c2f2 100644 --- a/intern/cycles/render/background.h +++ b/intern/cycles/render/background.h @@ -30,7 +30,7 @@ class Shader; class Background : public Node { public: - NODE_DECLARE; + NODE_DECLARE float ao_factor; float ao_distance; diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp index d9a297002c6..c2f6293a50b 100644 --- a/intern/cycles/render/bake.cpp +++ b/intern/cycles/render/bake.cpp @@ -171,9 +171,9 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre /* needs to be up to data for attribute access */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_alloc("bake_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_READ_WRITE); + device->mem_alloc("bake_output", d_output, MEM_READ_WRITE); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h index 25f5eb3c897..aed9c5a8e75 100644 --- a/intern/cycles/render/bake.h +++ b/intern/cycles/render/bake.h @@ -73,7 +73,7 @@ public: bool need_update; - int total_pixel_samples; + size_t total_pixel_samples; private: BakeData *m_bake_data; diff --git 
a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index f1692712d61..e3ef4bf13fb 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -129,13 +129,13 @@ void RenderBuffers::reset(Device *device, BufferParams& params_) /* allocate buffer */ buffer.resize(params.width*params.height*params.get_passes_size()); - device->mem_alloc(buffer, MEM_READ_WRITE); + device->mem_alloc("render_buffer", buffer, MEM_READ_WRITE); device->mem_zero(buffer); /* allocate rng state */ rng_state.resize(params.width, params.height); - device->mem_alloc(rng_state, MEM_READ_WRITE); + device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE); } bool RenderBuffers::copy_from_device() diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h index 141ef9cccef..655d74e42d8 100644 --- a/intern/cycles/render/camera.h +++ b/intern/cycles/render/camera.h @@ -39,7 +39,7 @@ class Scene; class Camera : public Node { public: - NODE_DECLARE; + NODE_DECLARE /* Specifies an offset for the shutter's time interval. 
*/ enum MotionPosition { diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h index 9fa51c51f52..d917057ed91 100644 --- a/intern/cycles/render/film.h +++ b/intern/cycles/render/film.h @@ -53,7 +53,7 @@ public: class Film : public Node { public: - NODE_DECLARE; + NODE_DECLARE float exposure; array<Pass> passes; diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h index 780fdf49ca4..06524d3fa13 100644 --- a/intern/cycles/render/graph.h +++ b/intern/cycles/render/graph.h @@ -201,14 +201,14 @@ public: /* Node definition utility macros */ #define SHADER_NODE_CLASS(type) \ - NODE_DECLARE; \ + NODE_DECLARE \ type(); \ virtual ShaderNode *clone() const { return new type(*this); } \ virtual void compile(SVMCompiler& compiler); \ virtual void compile(OSLCompiler& compiler); \ #define SHADER_NODE_NO_CLONE_CLASS(type) \ - NODE_DECLARE; \ + NODE_DECLARE \ type(); \ virtual void compile(SVMCompiler& compiler); \ virtual void compile(OSLCompiler& compiler); \ diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp index fd8a1262208..8985431b68a 100644 --- a/intern/cycles/render/image.cpp +++ b/intern/cycles/render/image.cpp @@ -285,9 +285,8 @@ int ImageManager::add_image(const string& filename, thread_scoped_lock device_lock(device_mutex); - /* Do we have a float? */ - if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4) - is_float = true; + /* Check whether it's a float texture. 
*/ + is_float = (type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4); /* No single channel and half textures on CUDA (Fermi) and no half on OpenCL, use available slots */ if((type == IMAGE_DATA_TYPE_FLOAT || diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 27fff4831e5..3ce41d5a185 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -29,7 +29,7 @@ class Scene; class Integrator : public Node { public: - NODE_DECLARE; + NODE_DECLARE int min_bounce; int max_bounce; diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp index 6a4557506c3..fc6790dc022 100644 --- a/intern/cycles/render/light.cpp +++ b/intern/cycles/render/light.cpp @@ -57,9 +57,9 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_alloc("shade_background_pixels_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc("shade_background_pixels_output", d_output, MEM_WRITE_ONLY); DeviceTask main_task(DeviceTask::SHADER); main_task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp index c42b32919d4..b7660297f3e 100644 --- a/intern/cycles/render/mesh.cpp +++ b/intern/cycles/render/mesh.cpp @@ -1873,9 +1873,14 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene * dscene->prim_object.reference((uint*)&pack.prim_object[0], pack.prim_object.size()); device->tex_alloc("__prim_object", dscene->prim_object); } + if(pack.prim_time.size()) { + dscene->prim_time.reference((float2*)&pack.prim_time[0], pack.prim_time.size()); + device->tex_alloc("__prim_time", dscene->prim_time); + } dscene->data.bvh.root = pack.root_index; dscene->data.bvh.use_qbvh = scene->params.use_qbvh; + 
dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0); } void MeshManager::device_update_flags(Device * /*device*/, @@ -2152,6 +2157,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) device->tex_free(dscene->prim_visibility); device->tex_free(dscene->prim_index); device->tex_free(dscene->prim_object); + device->tex_free(dscene->prim_time); device->tex_free(dscene->tri_shader); device->tex_free(dscene->tri_vnormal); device->tex_free(dscene->tri_vindex); @@ -2173,6 +2179,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene) dscene->prim_visibility.clear(); dscene->prim_index.clear(); dscene->prim_object.clear(); + dscene->prim_time.clear(); dscene->tri_shader.clear(); dscene->tri_vnormal.clear(); dscene->tri_vindex.clear(); diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h index 5f33e30eac2..1f8b880c161 100644 --- a/intern/cycles/render/mesh.h +++ b/intern/cycles/render/mesh.h @@ -48,7 +48,7 @@ struct PackedPatchTable; class Mesh : public Node { public: - NODE_DECLARE; + NODE_DECLARE /* Mesh Triangle */ struct Triangle { diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp index adc5b820298..4acb7911560 100644 --- a/intern/cycles/render/mesh_displace.cpp +++ b/intern/cycles/render/mesh_displace.cpp @@ -121,9 +121,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me /* needs to be up to data for attribute access */ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data)); - device->mem_alloc(d_input, MEM_READ_ONLY); + device->mem_alloc("displace_input", d_input, MEM_READ_ONLY); device->mem_copy_to(d_input); - device->mem_alloc(d_output, MEM_WRITE_ONLY); + device->mem_alloc("displace_output", d_output, MEM_WRITE_ONLY); DeviceTask task(DeviceTask::SHADER); task.shader_input = d_input.device_pointer; diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 13b149eddfa..7052c03ed94 
100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -27,6 +27,7 @@ #include "util_sky_model.h" #include "util_foreach.h" +#include "util_logging.h" #include "util_transform.h" CCL_NAMESPACE_BEGIN @@ -1931,21 +1932,38 @@ GlossyBsdfNode::GlossyBsdfNode() void GlossyBsdfNode::simplify_settings(Scene *scene) { if(distribution_orig == NBUILTIN_CLOSURES) { + roughness_orig = roughness; distribution_orig = distribution; } + else { + /* By default we use original values, so we don't worry about restoring + * defaults later on and can only do override when needed. + */ + roughness = roughness_orig; + distribution = distribution_orig; + } Integrator *integrator = scene->integrator; + ShaderInput *roughness_input = input("Roughness"); if(integrator->filter_glossy == 0.0f) { /* Fallback to Sharp closure for Roughness close to 0. * Note: Keep the epsilon in sync with kernel! */ - ShaderInput *roughness_input = input("Roughness"); if(!roughness_input->link && roughness <= 1e-4f) { + VLOG(1) << "Using sharp glossy BSDF."; distribution = CLOSURE_BSDF_REFLECTION_ID; } } else { - /* Rollback to original distribution when filter glossy is used. */ - distribution = distribution_orig; + /* If filter glossy is used we replace Sharp glossy with GGX so we can + * benefit from closure blur to remove unwanted noise. 
+ */ + if(roughness_input->link == NULL && + distribution == CLOSURE_BSDF_REFLECTION_ID) + { + VLOG(1) << "Using GGX glossy with filter glossy."; + distribution = CLOSURE_BSDF_MICROFACET_GGX_ID; + roughness = 0.0f; + } } closure = distribution; } @@ -1953,7 +1971,8 @@ void GlossyBsdfNode::simplify_settings(Scene *scene) bool GlossyBsdfNode::has_integrator_dependency() { ShaderInput *roughness_input = input("Roughness"); - return !roughness_input->link && roughness <= 1e-4f; + return !roughness_input->link && + (distribution == CLOSURE_BSDF_REFLECTION_ID || roughness <= 1e-4f); } void GlossyBsdfNode::compile(SVMCompiler& compiler) @@ -2008,21 +2027,38 @@ GlassBsdfNode::GlassBsdfNode() void GlassBsdfNode::simplify_settings(Scene *scene) { if(distribution_orig == NBUILTIN_CLOSURES) { + roughness_orig = roughness; distribution_orig = distribution; } + else { + /* By default we use original values, so we don't worry about restoring + * defaults later on and can only do override when needed. + */ + roughness = roughness_orig; + distribution = distribution_orig; + } Integrator *integrator = scene->integrator; + ShaderInput *roughness_input = input("Roughness"); if(integrator->filter_glossy == 0.0f) { /* Fallback to Sharp closure for Roughness close to 0. * Note: Keep the epsilon in sync with kernel! */ - ShaderInput *roughness_input = input("Roughness"); if(!roughness_input->link && roughness <= 1e-4f) { + VLOG(1) << "Using sharp glass BSDF."; distribution = CLOSURE_BSDF_SHARP_GLASS_ID; } } else { - /* Rollback to original distribution when filter glossy is used. */ - distribution = distribution_orig; + /* If filter glossy is used we replace Sharp glass with GGX so we can + * benefit from closure blur to remove unwanted noise. 
+ */ + if(roughness_input->link == NULL && + distribution == CLOSURE_BSDF_SHARP_GLASS_ID) + { + VLOG(1) << "Using GGX glass with filter glossy."; + distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID; + roughness = 0.0f; + } } closure = distribution; } @@ -2030,7 +2066,8 @@ void GlassBsdfNode::simplify_settings(Scene *scene) bool GlassBsdfNode::has_integrator_dependency() { ShaderInput *roughness_input = input("Roughness"); - return !roughness_input->link && roughness <= 1e-4f; + return !roughness_input->link && + (distribution == CLOSURE_BSDF_SHARP_GLASS_ID || roughness <= 1e-4f); } void GlassBsdfNode::compile(SVMCompiler& compiler) @@ -2085,21 +2122,38 @@ RefractionBsdfNode::RefractionBsdfNode() void RefractionBsdfNode::simplify_settings(Scene *scene) { if(distribution_orig == NBUILTIN_CLOSURES) { + roughness_orig = roughness; distribution_orig = distribution; } + else { + /* By default we use original values, so we don't worry about restoring + * defaults later on and can only do override when needed. + */ + roughness = roughness_orig; + distribution = distribution_orig; + } Integrator *integrator = scene->integrator; + ShaderInput *roughness_input = input("Roughness"); if(integrator->filter_glossy == 0.0f) { /* Fallback to Sharp closure for Roughness close to 0. * Note: Keep the epsilon in sync with kernel! */ - ShaderInput *roughness_input = input("Roughness"); if(!roughness_input->link && roughness <= 1e-4f) { + VLOG(1) << "Using sharp refraction BSDF."; distribution = CLOSURE_BSDF_REFRACTION_ID; } } else { - /* Rollback to original distribution when filter glossy is used. */ - distribution = distribution_orig; + /* If filter glossy is used we replace Sharp refraction with GGX so we can + * benefit from closure blur to remove unwanted noise. 
+ */ + if(roughness_input->link == NULL && + distribution == CLOSURE_BSDF_REFRACTION_ID) + { + VLOG(1) << "Using GGX refraction with filter glossy."; + distribution = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID; + roughness = 0.0f; + } } closure = distribution; } @@ -2107,7 +2161,8 @@ void RefractionBsdfNode::simplify_settings(Scene *scene) bool RefractionBsdfNode::has_integrator_dependency() { ShaderInput *roughness_input = input("Roughness"); - return !roughness_input->link && roughness <= 1e-4f; + return !roughness_input->link && + (distribution == CLOSURE_BSDF_REFRACTION_ID || roughness <= 1e-4f); } void RefractionBsdfNode::compile(SVMCompiler& compiler) diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index eb0f7977dd1..d159c801810 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -388,7 +388,7 @@ public: bool has_integrator_dependency(); ClosureType get_closure_type() { return distribution; } - float roughness; + float roughness, roughness_orig; ClosureType distribution, distribution_orig; }; @@ -400,7 +400,7 @@ public: bool has_integrator_dependency(); ClosureType get_closure_type() { return distribution; } - float roughness, IOR; + float roughness, roughness_orig, IOR; ClosureType distribution, distribution_orig; }; @@ -412,7 +412,7 @@ public: bool has_integrator_dependency(); ClosureType get_closure_type() { return distribution; } - float roughness, IOR; + float roughness, roughness_orig, IOR; ClosureType distribution, distribution_orig; }; diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h index 7e306fab2a8..3495849d149 100644 --- a/intern/cycles/render/object.h +++ b/intern/cycles/render/object.h @@ -40,7 +40,7 @@ struct Transform; class Object : public Node { public: - NODE_DECLARE; + NODE_DECLARE Mesh *mesh; Transform tfm; diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h index 8768682043f..9f398c444f4 100644 --- a/intern/cycles/render/scene.h +++ 
b/intern/cycles/render/scene.h @@ -69,6 +69,7 @@ public: device_vector<uint> prim_visibility; device_vector<uint> prim_index; device_vector<uint> prim_object; + device_vector<float2> prim_time; /* mesh */ device_vector<uint> tri_shader; diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 7c01934cfd8..0c7bd271371 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -230,7 +230,9 @@ void Session::run_gpu() while(1) { scoped_timer pause_timer; pause_cond.wait(pause_lock); - progress.add_skip_time(pause_timer, params.background); + if(pause) { + progress.add_skip_time(pause_timer, params.background); + } update_status_time(pause, no_tiles); progress.set_update(); @@ -520,7 +522,9 @@ void Session::run_cpu() while(1) { scoped_timer pause_timer; pause_cond.wait(pause_lock); - progress.add_skip_time(pause_timer, params.background); + if(pause) { + progress.add_skip_time(pause_timer, params.background); + } update_status_time(pause, no_tiles); progress.set_update(); @@ -650,6 +654,8 @@ void Session::load_kernels() if(!kernels_loaded) { progress.set_status("Loading render kernels (may take a few minutes the first time)"); + scoped_timer timer; + DeviceRequestedFeatures requested_features = get_requested_device_features(); VLOG(2) << "Requested features:\n" << requested_features; if(!device->load_kernels(requested_features)) { @@ -663,6 +669,9 @@ void Session::load_kernels() return; } + progress.add_skip_time(timer, false); + VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start(); + kernels_loaded = true; } } @@ -883,6 +892,7 @@ void Session::path_trace() task.need_finish_queue = params.progressive_refine; task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH; task.requested_tile_size = params.tile_size; + task.passes_size = tile_manager.params.get_passes_size(); device->task_add(task); } diff --git a/intern/cycles/render/shader.h 
b/intern/cycles/render/shader.h index 7d896652196..490c3f1c95d 100644 --- a/intern/cycles/render/shader.h +++ b/intern/cycles/render/shader.h @@ -82,7 +82,7 @@ enum DisplacementMethod { class Shader : public Node { public: - NODE_DECLARE; + NODE_DECLARE int pass_id; diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index 433e41fbbb6..6c52117ef9a 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -32,6 +32,13 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) } } +#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x)) + +#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) + +#define CCL_LOCAL_MEM_FENCE 0 +#define ccl_barrier(flags) (void)0 + #else /* __KERNEL_GPU__ */ #ifdef __KERNEL_OPENCL__ @@ -39,7 +46,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value) /* Float atomics implementation credits: * http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html */ -ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source, +ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source, const float operand) { union { @@ -56,10 +63,29 @@ ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *sou } while(atomic_cmpxchg((volatile ccl_global unsigned int *)source, prev_value.int_value, new_value.int_value) != prev_value.int_value); + return new_value.float_value; } +#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x)) +#define atomic_fetch_and_inc_uint32(p) atomic_inc((p)) + +#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE +#define ccl_barrier(flags) barrier(flags) + #endif /* __KERNEL_OPENCL__ */ +#ifdef __KERNEL_CUDA__ + +#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x)) + +#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x)) 
+#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1) + +#define CCL_LOCAL_MEM_FENCE +#define ccl_barrier(flags) __syncthreads() + +#endif /* __KERNEL_CUDA__ */ + #endif /* __KERNEL_GPU__ */ #endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 80d177d2cae..f12c5e28c80 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -29,7 +29,8 @@ DebugFlags::CPU::CPU() sse41(true), sse3(true), sse2(true), - qbvh(true) + qbvh(true), + split_kernel(false) { reset(); } @@ -55,10 +56,12 @@ void DebugFlags::CPU::reset() #undef CHECK_CPU_FLAGS qbvh = true; + split_kernel = false; } DebugFlags::CUDA::CUDA() - : adaptive_compile(false) + : adaptive_compile(false), + split_kernel(false) { reset(); } @@ -67,6 +70,8 @@ void DebugFlags::CUDA::reset() { if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; + + split_kernel = false; } DebugFlags::OpenCL::OpenCL() @@ -133,7 +138,9 @@ std::ostream& operator <<(std::ostream &os, << " AVX : " << string_from_bool(debug_flags.cpu.avx) << "\n" << " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n" << " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n" - << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"; + << " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n" + << " QBVH : " << string_from_bool(debug_flags.cpu.qbvh) << "\n" + << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n"; os << "CUDA flags:\n" << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n"; diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 73fd228b5d9..911c95de4ab 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -46,6 +46,9 @@ public: /* Whether QBVH usage is allowed or not. 
*/ bool qbvh; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of CUDA feature-set to be used. */ @@ -58,6 +61,9 @@ public: /* Whether adaptive feature based runtime compile is enabled or not. * Requires the CUDA Toolkit and only works on Linux atm. */ bool adaptive_compile; + + /* Whether split kernel is used */ + bool split_kernel; }; /* Descriptor of OpenCL feature-set to be used. */ diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index 5db3384cda4..c1a47d58c55 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -18,6 +18,7 @@ #define __UTIL_HALF_H__ #include "util_types.h" +#include "util_math.h" #ifdef __KERNEL_SSE2__ #include "util_simd.h" @@ -110,6 +111,28 @@ ccl_device_inline float4 half4_to_float4(half4 h) return f; } +ccl_device_inline half float_to_half(float f) +{ + const uint u = __float_as_uint(f); + /* Sign bit, shifted to its position. */ + uint sign_bit = u & 0x80000000; + sign_bit >>= 16; + /* Exponent. */ + uint exponent_bits = u & 0x7f800000; + /* Non-sign bits. */ + uint value_bits = u & 0x7fffffff; + value_bits >>= 13; /* Align mantissa on MSB. */ + value_bits -= 0x1c000; /* Adjust bias. */ + /* Flush-to-zero. */ + value_bits = (exponent_bits < 0x38800000) ? 0 : value_bits; + /* Clamp-to-max. */ + value_bits = (exponent_bits > 0x47000000) ? 0x7bff : value_bits; + /* Denormals-as-zero. */ + value_bits = (exponent_bits == 0 ? 0 : value_bits); + /* Re-insert sign bit and return. 
*/ + return (value_bits | sign_bit); +} + #endif #endif diff --git a/intern/cycles/util/util_image_impl.h b/intern/cycles/util/util_image_impl.h index 73ecfda0855..4daf1eaac22 100644 --- a/intern/cycles/util/util_image_impl.h +++ b/intern/cycles/util/util_image_impl.h @@ -19,6 +19,7 @@ #include "util_algorithm.h" #include "util_debug.h" +#include "util_half.h" #include "util_image.h" CCL_NAMESPACE_BEGIN @@ -38,6 +39,52 @@ const T *util_image_read(const vector<T>& pixels, return &pixels[index]; } +/* Cast input pixel from unknown storage to float. */ +template<typename T> +inline float cast_to_float(T value); + +template<> +inline float cast_to_float(float value) +{ + return value; +} +template<> +inline float cast_to_float(uchar value) +{ + return (float)value / 255.0f; +} +template<> +inline float cast_to_float(half value) +{ + return half_to_float(value); +} + +/* Cast float value to output pixel type. */ +template<typename T> +inline T cast_from_float(float value); + +template<> +inline float cast_from_float(float value) +{ + return value; +} +template<> +inline uchar cast_from_float(float value) +{ + if(value < 0.0f) { + return 0; + } + else if(value > (1.0f - 0.5f / 255.0f)) { + return 255; + } + return (uchar)((255.0f * value) + 0.5f); +} +template<> +inline half cast_from_float(float value) +{ + return float_to_half(value); +} + template<typename T> void util_image_downscale_sample(const vector<T>& pixels, const size_t width, @@ -71,15 +118,22 @@ void util_image_downscale_sample(const vector<T>& pixels, components, nx, ny, nz); for(size_t k = 0; k < components; ++k) { - accum[k] += pixel[k]; + accum[k] += cast_to_float(pixel[k]); } ++count; } } } - const float inv_count = 1.0f / (float)count; - for(size_t k = 0; k < components; ++k) { - result[k] = T(accum[k] * inv_count); + if(count != 0) { + const float inv_count = 1.0f / (float)count; + for(size_t k = 0; k < components; ++k) { + result[k] = cast_from_float<T>(accum[k] * inv_count); + } + } + else { + 
for(size_t k = 0; k < components; ++k) { + result[k] = T(0.0f); + } } } diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp index 03041723e15..6824f1ff83c 100644 --- a/intern/cycles/util/util_logging.cpp +++ b/intern/cycles/util/util_logging.cpp @@ -69,6 +69,15 @@ void util_logging_verbosity_set(int verbosity) } std::ostream& operator <<(std::ostream &os, + const int2 &value) +{ + os << "(" << value.x + << ", " << value.y + << ")"; + return os; +} + +std::ostream& operator <<(std::ostream &os, const float3 &value) { os << "(" << value.x diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h index 2aa9c25b1a0..ecf9c9cfee0 100644 --- a/intern/cycles/util/util_logging.h +++ b/intern/cycles/util/util_logging.h @@ -45,6 +45,7 @@ public: #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level) +struct int2; struct float3; void util_logging_init(const char *argv0); @@ -52,6 +53,8 @@ void util_logging_start(void); void util_logging_verbosity_set(int verbosity); std::ostream& operator <<(std::ostream &os, + const int2 &value); +std::ostream& operator <<(std::ostream &os, const float3 &value); CCL_NAMESPACE_END diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 2b81c8c498a..ae4b3d77f12 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -1329,7 +1329,7 @@ ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b) y = (b.y != 0.0f)? a.y/b.y: 0.0f; z = (b.z != 0.0f)? a.z/b.z: 0.0f; - /* try to get grey even if b is zero */ + /* try to get gray even if b is zero */ if(b.x == 0.0f) { if(b.y == 0.0f) { x = z; diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp index 5df262fcbbb..1b2e8aace5b 100644 --- a/intern/cycles/util/util_path.cpp +++ b/intern/cycles/util/util_path.cpp @@ -814,7 +814,7 @@ string path_source_replace_includes(const string& source, /* Use line directives for better error messages. 
*/ line = line_directive(filepath, 1) + token.replace(0, n_end + 1, "\n" + text + "\n") - + line_directive(path_join(path, source_filename), i); + + line_directive(path_join(path, source_filename), i + 1); } } } diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h index 2f5295b5463..cf99a08efae 100644 --- a/intern/cycles/util/util_ssef.h +++ b/intern/cycles/util/util_ssef.h @@ -514,12 +514,12 @@ ccl_device_inline float len3(const ssef& a) /* faster version for SSSE3 */ typedef ssei shuffle_swap_t; -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) { return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) { return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } @@ -534,12 +534,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s /* somewhat slower version for SSE2 */ typedef int shuffle_swap_t; -ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void) +ccl_device_inline shuffle_swap_t shuffle_swap_identity(void) { return 0; } -ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void) +ccl_device_inline shuffle_swap_t shuffle_swap_swap(void) { return 1; } diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h index 033d85e8ec6..e90049254de 100644 --- a/intern/cycles/util/util_static_assert.h +++ b/intern/cycles/util/util_static_assert.h @@ -43,7 +43,9 @@ template <> class StaticAssertFailure<true> {}; # endif /* __COUNTER__ */ # endif /* C++11 or MSVC2015 */ #else /* __KERNEL_GPU__ */ -# define static_assert(statement, message) +# ifndef static_assert +# define static_assert(statement, message) +# endif #endif /* __KERNEL_GPU__ */ /* TODO(sergey): For until C++11 is a bare minimum for us, diff --git 
a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h index a000fae4bd6..36d2f1053c7 100644 --- a/intern/cycles/util/util_types.h +++ b/intern/cycles/util/util_types.h @@ -37,6 +37,9 @@ #define ccl_device_noinline static #define ccl_global #define ccl_constant +#define ccl_local +#define ccl_local_param +#define ccl_private #define ccl_restrict __restrict #define __KERNEL_WITH_SSE_ALIGN__ @@ -397,11 +400,6 @@ ccl_device_inline float4 make_float4(float x, float y, float z, float w) return a; } -ccl_device_inline int align_up(int offset, int alignment) -{ - return (offset + alignment - 1) & ~(alignment - 1); -} - ccl_device_inline int3 make_int3(int i) { #ifdef __KERNEL_SSE__ @@ -476,6 +474,21 @@ ccl_device_inline int4 make_int4(const float3& f) #endif +ccl_device_inline int align_up(int offset, int alignment) +{ + return (offset + alignment - 1) & ~(alignment - 1); +} + +ccl_device_inline int round_up(int x, int multiple) +{ + return ((x + multiple - 1) / multiple) * multiple; +} + +ccl_device_inline int round_down(int x, int multiple) +{ + return (x / multiple) * multiple; +} + /* Interpolation types for textures * cuda also use texture space to store other objects */ enum InterpolationType { diff --git a/intern/ffmpeg/ffmpeg_compat.h b/intern/ffmpeg/ffmpeg_compat.h index bcfa24b06a8..d6220ebf562 100644 --- a/intern/ffmpeg/ffmpeg_compat.h +++ b/intern/ffmpeg/ffmpeg_compat.h @@ -350,7 +350,12 @@ int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture, FFMPEG_INLINE int64_t av_get_pts_from_frame(AVFormatContext *avctx, AVFrame * picture) { - int64_t pts = picture->pkt_pts; + int64_t pts; +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(55, 34, 100) + pts = picture->pts; +#else + pts = picture->pkt_pts; +#endif if (pts == AV_NOPTS_VALUE) { pts = picture->pkt_dts; diff --git a/intern/libmv/libmv/numeric/numeric.h b/intern/libmv/libmv/numeric/numeric.h index a42dab8c7a2..1a23d653676 100644 --- a/intern/libmv/libmv/numeric/numeric.h +++ 
b/intern/libmv/libmv/numeric/numeric.h @@ -36,7 +36,7 @@ #if !defined(__MINGW64__) # if defined(_WIN32) || defined(__APPLE__) || \ defined(__FreeBSD__) || defined(__NetBSD__) -static void sincos(double x, double *sinx, double *cosx) { +inline void sincos(double x, double *sinx, double *cosx) { *sinx = sin(x); *cosx = cos(x); } @@ -5,8 +5,8 @@ REM This is for users who like to configure & build Blender with a single comman setlocal ENABLEEXTENSIONS set BLENDER_DIR=%~dp0 set BLENDER_DIR_NOSPACES=%BLENDER_DIR: =% -if not "%BLENDER_DIR%"=="%BLENDER_DIR_NOSPACES%" ( - echo There are spaces detected in the build path "%BLENDER_DIR%", this is currently not supported, exiting.... +if not "%BLENDER_DIR%"=="%BLENDER_DIR_NOSPACES%" ( + echo There are spaces detected in the build path "%BLENDER_DIR%", this is currently not supported, exiting.... goto EOF ) set BUILD_DIR=%BLENDER_DIR%..\build_windows @@ -79,7 +79,7 @@ if NOT "%1" == "" ( set NOBUILD=1 ) else if "%1" == "showhash" ( for /f "delims=" %%i in ('git rev-parse HEAD') do echo Branch_hash=%%i - cd release/datafiles/locale + cd release/datafiles/locale for /f "delims=" %%i in ('git rev-parse HEAD') do echo Locale_hash=%%i cd %~dp0 cd release/scripts/addons @@ -132,13 +132,13 @@ if "%BUILD_ARCH%"=="x64" ( if "%target%"=="Release" ( - rem for vc12 check for both cuda 7.5 and 8 + rem for vc12 check for both cuda 7.5 and 8 if "%CUDA_PATH%"=="" ( echo Cuda Not found, aborting! goto EOF ) set BUILD_CMAKE_ARGS=%BUILD_CMAKE_ARGS% ^ - -C"%BLENDER_DIR%\build_files\cmake\config\blender_release.cmake" + -C"%BLENDER_DIR%\build_files\cmake\config\blender_release.cmake" ) :DetectMSVC @@ -157,7 +157,7 @@ if DEFINED MSVC_VC_DIR goto msvc_detect_finally if DEFINED MSVC_VC_DIR call "%MSVC_VC_DIR%\vcvarsall.bat" if DEFINED MSVC_VC_DIR goto sanity_checks -rem MSVC Build environment 2017 and up. +rem MSVC Build environment 2017 and up. 
for /F "usebackq skip=2 tokens=1-2*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\VisualStudio\SXS\VS7" /v %BUILD_VS_VER%.0 2^>nul`) DO set MSVC_VS_DIR=%%C if DEFINED MSVC_VS_DIR goto msvc_detect_finally_2017 REM Check 32 bits @@ -202,7 +202,7 @@ if NOT EXIST %BLENDER_DIR%..\lib\nul ( if "%TARGET%"=="" ( echo Error: Convenience target not set echo This is required for building, aborting! - echo . + echo . goto HELP ) @@ -266,15 +266,15 @@ echo. echo At any point you can optionally modify your build configuration by editing: echo "%BUILD_DIR%\CMakeCache.txt", then run "make" again to build with the changes applied. echo. -echo Blender successfully built, run from: "%BUILD_DIR%\bin\%BUILD_TYPE%" +echo Blender successfully built, run from: "%BUILD_DIR%\bin\%BUILD_TYPE%\blender.exe" echo. goto EOF :HELP echo. echo Convenience targets - echo - release ^(identical to the offical blender.org builds^) + echo - release ^(identical to the official blender.org builds^) echo - full ^(same as release minus the cuda kernels^) - echo - lite + echo - lite echo - headless echo - cycles echo - bpy @@ -289,11 +289,10 @@ goto EOF echo - with_tests ^(enable building unit tests^) echo - debug ^(Build an unoptimized debuggable build^) echo - packagename [newname] ^(override default cpack package name^) - echo - x86 ^(override host autodetect and build 32 bit code^) - echo - x64 ^(override host autodetect and build 64 bit code^) + echo - x86 ^(override host auto-detect and build 32 bit code^) + echo - x64 ^(override host auto-detect and build 64 bit code^) echo - 2013 ^(build with visual studio 2013^) echo - 2015 ^(build with visual studio 2015^) [EXPERIMENTAL] echo. 
:EOF - diff --git a/release/scripts/freestyle/modules/freestyle/shaders.py b/release/scripts/freestyle/modules/freestyle/shaders.py index 633def38b5b..bce6642220b 100644 --- a/release/scripts/freestyle/modules/freestyle/shaders.py +++ b/release/scripts/freestyle/modules/freestyle/shaders.py @@ -568,7 +568,7 @@ class pyRandomColorShader(StrokeShader): class py2DCurvatureColorShader(StrokeShader): """ - Assigns a color (greyscale) to the stroke based on the curvature. + Assigns a color (grayscale) to the stroke based on the curvature. A higher curvature will yield a brighter color. """ def shade(self, stroke): @@ -584,7 +584,7 @@ class py2DCurvatureColorShader(StrokeShader): class pyTimeColorShader(StrokeShader): """ - Assigns a greyscale value that increases for every vertex. + Assigns a grayscale value that increases for every vertex. The brightness will increase along the stroke. """ def __init__(self, step=0.01): diff --git a/release/scripts/freestyle/modules/parameter_editor.py b/release/scripts/freestyle/modules/parameter_editor.py index 93305cb7c5a..b093920a4cb 100644 --- a/release/scripts/freestyle/modules/parameter_editor.py +++ b/release/scripts/freestyle/modules/parameter_editor.py @@ -1170,6 +1170,7 @@ class Seed: _seed = Seed() + def get_dashed_pattern(linestyle): """Extracts the dashed pattern from the various UI options """ pattern = [] @@ -1185,6 +1186,15 @@ def get_dashed_pattern(linestyle): return pattern +def get_grouped_objects(group): + for ob in group.objects: + if ob.dupli_type == 'GROUP' and ob.dupli_group is not None: + for dupli in get_grouped_objects(ob.dupli_group): + yield dupli + else: + yield ob + + integration_types = { 'MEAN': IntegrationType.MEAN, 'MIN': IntegrationType.MIN, @@ -1267,7 +1277,7 @@ def process(layer_name, lineset_name): # prepare selection criteria by group of objects if lineset.select_by_group: if lineset.group is not None: - names = {getQualifiedObjectName(ob): True for ob in lineset.group.objects} + names = 
{getQualifiedObjectName(ob): True for ob in get_grouped_objects(lineset.group)} upred = ObjectNamesUP1D(names, lineset.group_negation == 'EXCLUSIVE') selection_criteria.append(upred) # prepare selection criteria by image border diff --git a/release/scripts/modules/addon_utils.py b/release/scripts/modules/addon_utils.py index 0f096f5812c..886f078f046 100644 --- a/release/scripts/modules/addon_utils.py +++ b/release/scripts/modules/addon_utils.py @@ -31,8 +31,9 @@ __all__ = ( import bpy as _bpy _user_preferences = _bpy.context.user_preferences -error_duplicates = False error_encoding = False +# (name, file, path) +error_duplicates = [] addons_fake_modules = {} @@ -57,12 +58,11 @@ def paths(): def modules_refresh(module_cache=addons_fake_modules): - global error_duplicates global error_encoding import os - error_duplicates = False error_encoding = False + error_duplicates.clear() path_list = paths() @@ -168,7 +168,7 @@ def modules_refresh(module_cache=addons_fake_modules): if mod.__file__ != mod_path: print("multiple addons with the same name:\n %r\n %r" % (mod.__file__, mod_path)) - error_duplicates = True + error_duplicates.append((mod.bl_info["name"], mod.__file__, mod_path)) elif mod.__time__ != os.path.getmtime(mod_path): print("reloading addon:", diff --git a/release/scripts/presets/interface_theme/back_to_black.xml b/release/scripts/presets/interface_theme/back_to_black.xml index 915e9cb64f1..1636f5b5cf6 100644 --- a/release/scripts/presets/interface_theme/back_to_black.xml +++ b/release/scripts/presets/interface_theme/back_to_black.xml @@ -18,7 +18,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_regular> <wcol_tool> @@ -30,19 +30,19 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_tool> <wcol_radio> <ThemeWidgetColors outline="#2a2a2a" inner="#111111ff" inner_sel="#33406bff" - item="#191919ff" + item="#444444ff" 
text="#929292" text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_radio> <wcol_text> @@ -50,23 +50,23 @@ inner="#111111ff" inner_sel="#33406bff" item="#191919ff" - text="#e4e4e4" + text="#929292" text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_text> <wcol_option> - <ThemeWidgetColors outline="#2a2a2a" + <ThemeWidgetColors outline="#535353" inner="#111111ff" inner_sel="#33406bff" - item="#000000ff" - text="#c7c7c7" + item="#a3a3a3ff" + text="#929292" text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_option> <wcol_toggle> @@ -78,7 +78,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_toggle> <wcol_num> @@ -90,7 +90,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_num> <wcol_numslider> @@ -102,7 +102,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_numslider> <wcol_box> @@ -114,7 +114,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_box> <wcol_menu> @@ -126,7 +126,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_menu> <wcol_pulldown> @@ -138,7 +138,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_pulldown> <wcol_menu_back> @@ -150,7 +150,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_menu_back> <wcol_pie_menu> @@ -170,7 +170,7 @@ inner="#191919e6" inner_sel="#2d2d2de6" item="#646464ff" - text="#ffffff" + text="#929292" text_sel="#ffffff" show_shaded="FALSE" shadetop="25" @@ 
-186,7 +186,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_menu_item> <wcol_scroll> @@ -198,7 +198,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_scroll> <wcol_progress> @@ -210,7 +210,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_progress> <wcol_list_item> @@ -222,7 +222,7 @@ text_sel="#ffffff" show_shaded="TRUE" shadetop="-100" - shadedown="0"> + shadedown="5"> </ThemeWidgetColors> </wcol_list_item> <wcol_state> @@ -239,32 +239,35 @@ </user_interface> <view_3d> <ThemeView3D grid="#222222" + clipping_border_3d="#313131ff" wire="#888888" - wire_edit="#000000" + wire_edit="#6c75ff" gp_vertex="#000000" gp_vertex_select="#ff8500" gp_vertex_size="3" - lamp="#c1d40028" - speaker="#535353" - camera="#000000" - view_overlay="#000000" - empty="#000000" + text_grease_pencil="#b5e61d" object_selected="#f15800" object_active="#ff8c19" object_grouped="#083008" object_grouped_active="#55bb55" - transform="#ffffff" + text_keyframe="#ddd700" + camera="#535353" + empty="#535353" + lamp="#fff0d328" + speaker="#535353" vertex="#72cfdd" vertex_select="#ff8500" vertex_size="3" + vertex_bevel="#00a5ff" vertex_unreferenced="#000000" edge_select="#ffa000" edge_seam="#db2512" edge_sharp="#ff2020" edge_crease="#cc0099" + edge_bevel="#00a5ff" edge_facesel="#6b6b6b" freestyle_edge_mark="#7fff7f" - face="#73828f12" + face="#73828f41" face_select="#ffa4003c" face_dot="#ffa900" facedot_size="4" @@ -291,19 +294,18 @@ normal="#22dddd" vertex_normal="#2361dd" split_normal="#dd23dd" - bone_solid="#c8c8c8" bone_pose="#50c8ff" bone_pose_active="#8cffff" - frame_current="#60c040" - outline_width="1" + bone_solid="#c8c8c8" bundle_solid="#c8c8c8" camera_path="#5a5a5a" skin_root="#000000" - clipping_border_3d="#313131ff" - text_keyframe="#ddd700" - text_grease_pencil="#b5e61d" + 
view_overlay="#000000" + transform="#ffffff" + frame_current="#60c040" paint_curve_handle="#7fff7f7f" - paint_curve_pivot="#ff7f7f7f"> + paint_curve_pivot="#ff7f7f7f" + outline_width="1"> <space> <ThemeSpaceGradient title="#5d5d5d" text="#7d7d7d" @@ -312,23 +314,23 @@ header_text="#979797" header_text_hi="#ffffff" button="#00000057" - button_title="#c5c5c5" + button_title="#929292" button_text="#c3c3c3" - button_text_hi="#ffffff" + button_text_hi="#e5e5e5" tab_active="#212947" tab_inactive="#000000" tab_back="#060606ff" tab_outline="#000000"> <gradients> <ThemeGradientColors show_grad="TRUE" - gradient="#0a0a0a" + gradient="#1d1d1d" high_gradient="#000000"> </ThemeGradientColors> </gradients> <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -348,6 +350,7 @@ vertex="#ffffff" vertex_select="#ff8500" vertex_size="3" + vertex_bevel="#000000" vertex_unreferenced="#000000" handle_free="#808080" handle_auto="#909000" @@ -382,7 +385,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -418,7 +421,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -463,7 +466,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -501,6 +504,7 @@ keyframe_jitter_selected="#61c042" keyframe_border="#000000ff" keyframe_border_selected="#000000ff" + keyframe_scale_factor="1" summary="#00000000"> <space> <ThemeSpaceGeneric back="#080808" @@ -521,7 +525,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ 
-543,6 +547,7 @@ vertex="#0f13bb" vertex_select="#ff8500" vertex_size="3" + vertex_bevel="#000000" vertex_unreferenced="#000000" face="#ffffff0a" face_select="#ff85003c" @@ -596,7 +601,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -644,7 +649,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -673,7 +678,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -712,7 +717,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -744,7 +749,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -799,7 +804,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -835,7 +840,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -865,7 +870,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -903,7 +908,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -932,7 +937,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> 
</ThemePanelColors> </panelcolors> @@ -966,7 +971,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> @@ -1019,7 +1024,7 @@ <panelcolors> <ThemePanelColors header="#00000019" back="#72727280" - show_header="FALSE" + show_header="TRUE" show_back="FALSE"> </ThemePanelColors> </panelcolors> diff --git a/release/scripts/presets/keyconfig/3dsmax.py b/release/scripts/presets/keyconfig/3dsmax.py index 7694e338d68..6d05ff6982c 100644 --- a/release/scripts/presets/keyconfig/3dsmax.py +++ b/release/scripts/presets/keyconfig/3dsmax.py @@ -401,6 +401,12 @@ kmi = km.keymap_items.new('particle.hide', 'H', 'PRESS') kmi.properties.unselected = False kmi = km.keymap_items.new('particle.hide', 'H', 'PRESS', shift=True) kmi.properties.unselected = True +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_planar_constraint= True +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_accurate= True kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True) kmi.properties.release_confirm = True kmi = km.keymap_items.new('particle.brush_edit', 'LEFTMOUSE', 'PRESS') @@ -422,6 +428,12 @@ kmi.properties.value_2 = 'ENABLED' # Map 3D View km = kc.keymaps.new('3D View', space_type='VIEW_3D', region_type='WINDOW', modal=False) +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_planar_constraint= True +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_accurate= True kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True) kmi.properties.release_confirm = True kmi = 
km.keymap_items.new('view3d.cursor3d', 'ACTIONMOUSE', 'PRESS') diff --git a/release/scripts/presets/keyconfig/maya.py b/release/scripts/presets/keyconfig/maya.py index 67fd1fddcac..53129593a59 100644 --- a/release/scripts/presets/keyconfig/maya.py +++ b/release/scripts/presets/keyconfig/maya.py @@ -933,6 +933,12 @@ km = kc.keymaps.new('3D View', space_type='VIEW_3D', region_type='WINDOW', modal kmi = km.keymap_items.new('view3d.cursor3d', 'ACTIONMOUSE', 'PRESS') kmi = km.keymap_items.new('view3d.rotate', 'LEFTMOUSE', 'PRESS', alt=True) +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_planar_constraint = True +kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True) +kmi.properties.release_confirm = True +kmi.properties.use_accurate = True kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True) kmi.properties.release_confirm = True kmi = km.keymap_items.new('view3d.move', 'MIDDLEMOUSE', 'PRESS', alt=True) diff --git a/release/scripts/startup/bl_operators/object_quick_effects.py b/release/scripts/startup/bl_operators/object_quick_effects.py index ef10e279bb4..0b9e7fd7305 100644 --- a/release/scripts/startup/bl_operators/object_quick_effects.py +++ b/release/scripts/startup/bl_operators/object_quick_effects.py @@ -319,7 +319,7 @@ class QuickSmoke(Operator): def execute(self, context): if not bpy.app.build_options.mod_smoke: - self.report({'ERROR'}, "Build without Smoke modifier support") + self.report({'ERROR'}, "Built without Smoke modifier support") return {'CANCELLED'} fake_context = context.copy() @@ -568,7 +568,7 @@ class QuickFluid(Operator): def execute(self, context): if not bpy.app.build_options.mod_fluid: - self.report({'ERROR'}, "Build without Fluid modifier support") + self.report({'ERROR'}, "Built without Fluid modifier support") return {'CANCELLED'} fake_context = context.copy() diff --git 
a/release/scripts/startup/bl_operators/wm.py b/release/scripts/startup/bl_operators/wm.py index f5460d58d44..869070ed778 100644 --- a/release/scripts/startup/bl_operators/wm.py +++ b/release/scripts/startup/bl_operators/wm.py @@ -1008,11 +1008,9 @@ class WM_OT_doc_view(Operator): doc_id = doc_id if bpy.app.version_cycle == "release": - _prefix = ("https://www.blender.org/api/blender_python_api_%s%s_release" % - ("_".join(str(v) for v in bpy.app.version[:2]), bpy.app.version_char)) + _prefix = ("https://docs.blender.org/api/blender_python_api_current") else: - _prefix = ("https://www.blender.org/api/blender_python_api_%s" % - "_".join(str(v) for v in bpy.app.version)) + _prefix = ("https://docs.blender.org/api/blender_python_api_master") def execute(self, context): url = _wm_doc_get_id(self.doc_id, do_url=True, url_prefix=self._prefix) diff --git a/release/scripts/startup/bl_ui/properties_data_modifier.py b/release/scripts/startup/bl_ui/properties_data_modifier.py index d66fb08bcd6..68b6265ab15 100644 --- a/release/scripts/startup/bl_ui/properties_data_modifier.py +++ b/release/scripts/startup/bl_ui/properties_data_modifier.py @@ -951,6 +951,23 @@ class DATA_PT_modifiers(ModifierButtonsPanel, Panel): def SURFACE(self, layout, ob, md): layout.label(text="Settings are inside the Physics tab") + def SURFACE_DEFORM(self, layout, ob, md): + col = layout.column() + col.active = not md.is_bound + + col.prop(md, "target") + col.prop(md, "falloff") + + layout.separator() + + col = layout.column() + col.active = md.target is not None + + if md.is_bound: + col.operator("object.surfacedeform_bind", text="Unbind") + else: + col.operator("object.surfacedeform_bind", text="Bind") + def UV_PROJECT(self, layout, ob, md): split = layout.split() @@ -1320,7 +1337,9 @@ class DATA_PT_modifiers(ModifierButtonsPanel, Panel): row.prop(md, "thickness_vertex_group", text="Factor") col.prop(md, "use_crease", text="Crease Edges") - col.prop(md, "crease_weight", text="Crease Weight") + row = 
col.row() + row.active = md.use_crease + row.prop(md, "crease_weight", text="Crease Weight") col = split.column() diff --git a/release/scripts/startup/bl_ui/properties_object.py b/release/scripts/startup/bl_ui/properties_object.py index 3ff7a248c60..4a596981983 100644 --- a/release/scripts/startup/bl_ui/properties_object.py +++ b/release/scripts/startup/bl_ui/properties_object.py @@ -152,6 +152,33 @@ class OBJECT_PT_relations(ObjectButtonsPanel, Panel): sub.active = (parent is not None) +class OBJECT_PT_relations_extras(ObjectButtonsPanel, Panel): + bl_label = "Relations Extras" + bl_options = {'DEFAULT_CLOSED'} + + def draw(self, context): + layout = self.layout + + ob = context.object + + split = layout.split() + + if context.scene.render.engine != 'BLENDER_GAME': + col = split.column() + col.label(text="Tracking Axes:") + col.prop(ob, "track_axis", text="Axis") + col.prop(ob, "up_axis", text="Up Axis") + + col = split.column() + col.prop(ob, "use_slow_parent") + row = col.row() + row.active = ((ob.parent is not None) and (ob.use_slow_parent)) + row.prop(ob, "slow_parent_offset", text="Offset") + + layout.prop(ob, "use_extra_recalc_object") + layout.prop(ob, "use_extra_recalc_data") + + class GROUP_MT_specials(Menu): bl_label = "Group Specials" @@ -296,33 +323,6 @@ class OBJECT_PT_duplication(ObjectButtonsPanel, Panel): layout.prop(ob, "dupli_group", text="Group") -class OBJECT_PT_relations_extras(ObjectButtonsPanel, Panel): - bl_label = "Relations Extras" - bl_options = {'DEFAULT_CLOSED'} - - def draw(self, context): - layout = self.layout - - ob = context.object - - split = layout.split() - - if context.scene.render.engine != 'BLENDER_GAME': - col = split.column() - col.label(text="Tracking Axes:") - col.prop(ob, "track_axis", text="Axis") - col.prop(ob, "up_axis", text="Up Axis") - - col = split.column() - col.prop(ob, "use_slow_parent") - row = col.row() - row.active = ((ob.parent is not None) and (ob.use_slow_parent)) - row.prop(ob, "slow_parent_offset", 
text="Offset") - - layout.prop(ob, "use_extra_recalc_object") - layout.prop(ob, "use_extra_recalc_data") - - from bl_ui.properties_animviz import ( MotionPathButtonsPanel, OnionSkinButtonsPanel, diff --git a/release/scripts/startup/bl_ui/properties_physics_common.py b/release/scripts/startup/bl_ui/properties_physics_common.py index 277b59d187d..4478c6a4379 100644 --- a/release/scripts/startup/bl_ui/properties_physics_common.py +++ b/release/scripts/startup/bl_ui/properties_physics_common.py @@ -274,6 +274,8 @@ def basic_force_field_settings_ui(self, context, field): col.prop(field, "use_global_coords", text="Global") elif field.type == 'HARMONIC': col.prop(field, "use_multiple_springs") + if field.type == 'FORCE': + col.prop(field, "use_gravity_falloff", text="Gravitation") split = layout.split() diff --git a/release/scripts/startup/bl_ui/space_sequencer.py b/release/scripts/startup/bl_ui/space_sequencer.py index 8ab5b4724b8..67ffaa7fccf 100644 --- a/release/scripts/startup/bl_ui/space_sequencer.py +++ b/release/scripts/startup/bl_ui/space_sequencer.py @@ -668,14 +668,14 @@ class SEQUENCER_PT_effect(SequencerButtonsPanel, Panel): if (i % BT_ROW) == 1: row = col.row(align=True) - # Workaround - .active has to have a separate UI block to work + # Workaround - .enabled has to have a separate UI block to work if i == strip.multicam_source: sub = row.row(align=True) - sub.active = False + sub.enabled = False sub.operator("sequencer.cut_multicam", text="%d" % i).camera = i else: sub_1 = row.row(align=True) - sub_1.active = True + sub_1.enabled = True sub_1.operator("sequencer.cut_multicam", text="%d" % i).camera = i if strip.channel > BT_ROW and (strip_channel - 1) % BT_ROW: @@ -683,7 +683,7 @@ class SEQUENCER_PT_effect(SequencerButtonsPanel, Panel): row.label("") else: col.separator() - col.label(text="Two or more channels are needed below this strip.", icon="INFO") + col.label(text="Two or more channels are needed below this strip", icon="INFO") elif strip.type == 
'TEXT': diff --git a/release/scripts/startup/bl_ui/space_time.py b/release/scripts/startup/bl_ui/space_time.py index 508e62e4f56..1f62d8d6968 100644 --- a/release/scripts/startup/bl_ui/space_time.py +++ b/release/scripts/startup/bl_ui/space_time.py @@ -49,7 +49,10 @@ class TIME_HT_header(Header): row.prop(scene, "frame_preview_start", text="Start") row.prop(scene, "frame_preview_end", text="End") - layout.prop(scene, "frame_current", text="") + if scene.show_subframe: + layout.prop(scene, "frame_float", text="") + else: + layout.prop(scene, "frame_current", text="") layout.separator() @@ -135,6 +138,7 @@ class TIME_MT_view(Menu): layout.prop(st, "show_frame_indicator") layout.prop(scene, "show_keys_from_selected_only") + layout.prop(scene, "show_subframe") layout.separator() diff --git a/release/scripts/startup/bl_ui/space_userpref.py b/release/scripts/startup/bl_ui/space_userpref.py index 075a6f870fa..cd12255c24c 100644 --- a/release/scripts/startup/bl_ui/space_userpref.py +++ b/release/scripts/startup/bl_ui/space_userpref.py @@ -453,6 +453,7 @@ class USERPREF_PT_system(Panel): col.separator() col.label(text="Selection") col.prop(system, "select_method", text="") + col.prop(system, "use_select_pick_depth") col.separator() @@ -1243,7 +1244,7 @@ class USERPREF_MT_addons_online_resources(Menu): "wm.url_open", text="API Concepts", icon='URL', ).url = bpy.types.WM_OT_doc_view._prefix + "/info_quickstart.html" layout.operator("wm.url_open", text="Add-on Tutorial", icon='URL', - ).url = "http://www.blender.org/api/blender_python_api_current/info_tutorial_addon.html" + ).url = bpy.types.WM_OT_doc_view._prefix + "/info_tutorial_addon.html" class USERPREF_PT_addons(Panel): @@ -1317,11 +1318,18 @@ class USERPREF_PT_addons(Panel): # set in addon_utils.modules_refresh() if addon_utils.error_duplicates: - self.draw_error(col, - "Multiple addons using the same name found!\n" - "likely a problem with the script search path.\n" - "(see console for details)", - ) + box = col.box() 
+ row = box.row() + row.label("Multiple addons with the same name found!") + row.label(icon='ERROR') + box.label("Please delete one of each pair:") + for (addon_name, addon_file, addon_path) in addon_utils.error_duplicates: + box.separator() + sub_col = box.column(align=True) + sub_col.label(addon_name + ":") + sub_col.label(" " + addon_file) + sub_col.label(" " + addon_path) + if addon_utils.error_encoding: self.draw_error(col, diff --git a/source/blender/alembic/intern/abc_archive.cc b/source/blender/alembic/intern/abc_archive.cc index 0985a06d732..5f8fc1a3739 100644 --- a/source/blender/alembic/intern/abc_archive.cc +++ b/source/blender/alembic/intern/abc_archive.cc @@ -113,25 +113,25 @@ static OArchive create_archive(std::ostream *ostream, Alembic::Abc::MetaData &md, bool ogawa) { - md.set(Alembic::Abc::kApplicationNameKey, "Blender"); + md.set(Alembic::Abc::kApplicationNameKey, "Blender"); md.set(Alembic::Abc::kUserDescriptionKey, scene_name); - time_t raw_time; - time(&raw_time); - char buffer[128]; + time_t raw_time; + time(&raw_time); + char buffer[128]; #if defined _WIN32 || defined _WIN64 - ctime_s(buffer, 128, &raw_time); + ctime_s(buffer, 128, &raw_time); #else - ctime_r(&raw_time, buffer); + ctime_r(&raw_time, buffer); #endif - const std::size_t buffer_len = strlen(buffer); - if (buffer_len > 0 && buffer[buffer_len - 1] == '\n') { - buffer[buffer_len - 1] = '\0'; - } + const std::size_t buffer_len = strlen(buffer); + if (buffer_len > 0 && buffer[buffer_len - 1] == '\n') { + buffer[buffer_len - 1] = '\0'; + } - md.set(Alembic::Abc::kDateWrittenKey, buffer); + md.set(Alembic::Abc::kDateWrittenKey, buffer); ErrorHandler::Policy policy = ErrorHandler::kThrowPolicy; diff --git a/source/blender/alembic/intern/abc_curves.cc b/source/blender/alembic/intern/abc_curves.cc index 282777f3af0..0542255d84b 100644 --- a/source/blender/alembic/intern/abc_curves.cc +++ b/source/blender/alembic/intern/abc_curves.cc @@ -361,7 +361,7 @@ void read_curve_sample(Curve *cu, 
const ICurvesSchema &schema, const float time) * object directly and create a new DerivedMesh from that. Also we might need to * create new or delete existing NURBS in the curve. */ -DerivedMesh *AbcCurveReader::read_derivedmesh(DerivedMesh */*dm*/, const float time, int /*read_flag*/, const char **/*err_str*/) +DerivedMesh *AbcCurveReader::read_derivedmesh(DerivedMesh * /*dm*/, const float time, int /*read_flag*/, const char ** /*err_str*/) { ISampleSelector sample_sel(time); const ICurvesSchema::Sample sample = m_curves_schema.getValue(sample_sel); diff --git a/source/blender/alembic/intern/abc_customdata.cc b/source/blender/alembic/intern/abc_customdata.cc index ebf1b2ba96e..0d11ab79ddd 100644 --- a/source/blender/alembic/intern/abc_customdata.cc +++ b/source/blender/alembic/intern/abc_customdata.cc @@ -327,6 +327,11 @@ static void read_custom_data_ex(const ICompoundProperty &prop, } else if (data_type == CD_MLOOPUV) { IV2fGeomParam uv_param(prop, prop_header.getName()); + + if (!uv_param.isIndexed()) { + return; + } + IV2fGeomParam::Sample sample; uv_param.getIndexed(sample, iss); diff --git a/source/blender/alembic/intern/abc_exporter.cc b/source/blender/alembic/intern/abc_exporter.cc index d17506ff8b0..90a99469389 100644 --- a/source/blender/alembic/intern/abc_exporter.cc +++ b/source/blender/alembic/intern/abc_exporter.cc @@ -47,7 +47,7 @@ extern "C" { #ifdef WIN32 /* needed for MSCV because of snprintf from BLI_string */ -# include "BLI_winstuff.h" +# include "BLI_winstuff.h" #endif #include "BKE_anim.h" diff --git a/source/blender/alembic/intern/abc_mesh.cc b/source/blender/alembic/intern/abc_mesh.cc index 8bc9c335054..5a57e43326a 100644 --- a/source/blender/alembic/intern/abc_mesh.cc +++ b/source/blender/alembic/intern/abc_mesh.cc @@ -691,7 +691,7 @@ static void assign_materials(Main *bmain, Object *ob, const std::map<std::string assigned_name = mat_iter->second; } - assign_material(ob, assigned_name, it->second, BKE_MAT_ASSIGN_OBJECT); + 
assign_material(ob, assigned_name, it->second, BKE_MAT_ASSIGN_OBDATA); } } } diff --git a/source/blender/alembic/intern/abc_points.cc b/source/blender/alembic/intern/abc_points.cc index 4c78f3e83c7..fc84759b1d9 100644 --- a/source/blender/alembic/intern/abc_points.cc +++ b/source/blender/alembic/intern/abc_points.cc @@ -200,7 +200,7 @@ void read_points_sample(const IPointsSchema &schema, read_mverts(config.mvert, positions, vnormals); } -DerivedMesh *AbcPointsReader::read_derivedmesh(DerivedMesh *dm, const float time, int /*read_flag*/, const char **/*err_str*/) +DerivedMesh *AbcPointsReader::read_derivedmesh(DerivedMesh *dm, const float time, int /*read_flag*/, const char ** /*err_str*/) { ISampleSelector sample_sel(time); const IPointsSchema::Sample sample = m_schema.getValue(sample_sel); diff --git a/source/blender/alembic/intern/abc_transform.cc b/source/blender/alembic/intern/abc_transform.cc index 368a811bb2a..2c6ef09326c 100644 --- a/source/blender/alembic/intern/abc_transform.cc +++ b/source/blender/alembic/intern/abc_transform.cc @@ -122,7 +122,7 @@ Imath::Box3d AbcTransformWriter::bounds() return Imath::transform(bounds, m_matrix); } -bool AbcTransformWriter::hasAnimation(Object */*ob*/) const +bool AbcTransformWriter::hasAnimation(Object * /*ob*/) const { /* TODO(kevin): implement this. 
*/ return true; diff --git a/source/blender/alembic/intern/abc_util.cc b/source/blender/alembic/intern/abc_util.cc index 08c94f437e6..50fa43a3491 100644 --- a/source/blender/alembic/intern/abc_util.cc +++ b/source/blender/alembic/intern/abc_util.cc @@ -37,6 +37,8 @@ extern "C" { #include "DNA_object_types.h" #include "BLI_math.h" + +#include "PIL_time.h" } std::string get_id_name(Object *ob) @@ -523,3 +525,15 @@ AbcObjectReader *create_reader(const Alembic::AbcGeom::IObject &object, ImportSe return reader; } + +/* ********************** */ + +ScopeTimer::ScopeTimer(const char *message) + : m_message(message) + , m_start(PIL_check_seconds_timer()) +{} + +ScopeTimer::~ScopeTimer() +{ + fprintf(stderr, "%s: %fs\n", m_message, PIL_check_seconds_timer() - m_start); +} diff --git a/source/blender/alembic/intern/abc_util.h b/source/blender/alembic/intern/abc_util.h index a7ac9df91c7..85ba4d5c9c7 100644 --- a/source/blender/alembic/intern/abc_util.h +++ b/source/blender/alembic/intern/abc_util.h @@ -146,4 +146,23 @@ ABC_INLINE void copy_yup_from_zup(short yup[3], const short zup[3]) yup[2] = -zup[1]; } +/* *************************** */ + +#undef ABC_DEBUG_TIME + +class ScopeTimer { + const char *m_message; + double m_start; + +public: + ScopeTimer(const char *message); + ~ScopeTimer(); +}; + +#ifdef ABC_DEBUG_TIME +# define SCOPE_TIMER(message) ScopeTimer prof(message) +#else +# define SCOPE_TIMER(message) +#endif + #endif /* __ABC_UTIL_H__ */ diff --git a/source/blender/alembic/intern/alembic_capi.cc b/source/blender/alembic/intern/alembic_capi.cc index d8d017119b1..dc5146a26e0 100644 --- a/source/blender/alembic/intern/alembic_capi.cc +++ b/source/blender/alembic/intern/alembic_capi.cc @@ -542,6 +542,8 @@ ABC_INLINE bool is_mesh_and_strands(const IObject &object) static void import_startjob(void *user_data, short *stop, short *do_update, float *progress) { + SCOPE_TIMER("Alembic import, objects reading and creation"); + ImportJobData *data = static_cast<ImportJobData 
*>(user_data); data->stop = stop; @@ -677,6 +679,8 @@ static void import_startjob(void *user_data, short *stop, short *do_update, floa static void import_endjob(void *user_data) { + SCOPE_TIMER("Alembic import, cleanup"); + ImportJobData *data = static_cast<ImportJobData *>(user_data); std::vector<AbcObjectReader *>::iterator iter; diff --git a/source/blender/blenkernel/BKE_bvhutils.h b/source/blender/blenkernel/BKE_bvhutils.h index bf45a27e51c..cb72f0859d5 100644 --- a/source/blender/blenkernel/BKE_bvhutils.h +++ b/source/blender/blenkernel/BKE_bvhutils.h @@ -54,7 +54,6 @@ typedef struct BVHTreeFromEditMesh { /* default callbacks to bvh nearest and raycast */ BVHTree_NearestPointCallback nearest_callback; BVHTree_RayCastCallback raycast_callback; - BVHTree_NearestToRayCallback nearest_to_ray_callback; struct BMEditMesh *em; @@ -75,7 +74,6 @@ typedef struct BVHTreeFromMesh { /* default callbacks to bvh nearest and raycast */ BVHTree_NearestPointCallback nearest_callback; BVHTree_RayCastCallback raycast_callback; - BVHTree_NearestToRayCallback nearest_to_ray_callback; /* Vertex array, so that callbacks have instante access to data */ const struct MVert *vert; @@ -104,7 +102,7 @@ typedef struct BVHTreeFromMesh { * The tree is build in mesh space coordinates, this means special care must be made on queries * so that the coordinates and rays are first translated on the mesh local coordinates. * Reason for this is that bvh_from_mesh_* can use a cache in some cases and so it becomes possible to reuse a BVHTree. - * + * * free_bvhtree_from_mesh should be called when the tree is no longer needed. 
*/ BVHTree *bvhtree_from_editmesh_verts( @@ -118,7 +116,7 @@ BVHTree *bvhtree_from_editmesh_verts_ex( BVHTree *bvhtree_from_mesh_verts( struct BVHTreeFromMesh *data, struct DerivedMesh *mesh, float epsilon, int tree_type, int axis); BVHTree *bvhtree_from_mesh_verts_ex( - struct BVHTreeFromMesh *data, struct MVert *vert, const int numVerts, + struct BVHTreeFromMesh *data, const struct MVert *vert, const int numVerts, const bool vert_allocated, const BLI_bitmap *mask, int verts_num_active, float epsilon, int tree_type, int axis); @@ -135,8 +133,8 @@ BVHTree *bvhtree_from_mesh_edges( float epsilon, int tree_type, int axis); BVHTree *bvhtree_from_mesh_edges_ex( struct BVHTreeFromMesh *data, - struct MVert *vert, const bool vert_allocated, - struct MEdge *edge, const int edges_num, const bool edge_allocated, + const struct MVert *vert, const bool vert_allocated, + const struct MEdge *edge, const int edges_num, const bool edge_allocated, const BLI_bitmap *edges_mask, int edges_num_active, float epsilon, int tree_type, int axis); @@ -145,8 +143,8 @@ BVHTree *bvhtree_from_mesh_faces( int tree_type, int axis); BVHTree *bvhtree_from_mesh_faces_ex( struct BVHTreeFromMesh *data, - struct MVert *vert, const bool vert_allocated, - struct MFace *face, const int numFaces, const bool face_allocated, + const struct MVert *vert, const bool vert_allocated, + const struct MFace *face, const int numFaces, const bool face_allocated, const BLI_bitmap *mask, int numFaces_active, float epsilon, int tree_type, int axis); diff --git a/source/blender/blenkernel/BKE_curve.h b/source/blender/blenkernel/BKE_curve.h index 5558786d254..e111bd0e16b 100644 --- a/source/blender/blenkernel/BKE_curve.h +++ b/source/blender/blenkernel/BKE_curve.h @@ -36,6 +36,7 @@ struct BezTriple; struct Curve; struct EditNurb; +struct GHash; struct ListBase; struct Main; struct Nurb; @@ -52,6 +53,13 @@ typedef struct CurveCache { struct Path *path; } CurveCache; +/* Definitions needed for shape keys */ +typedef struct 
CVKeyIndex { + void *orig_cv; + int key_index, nu_index, pt_index, vertex_index; + bool switched; +} CVKeyIndex; + #define KNOTSU(nu) ( (nu)->orderu + (nu)->pntsu + (((nu)->flagu & CU_NURB_CYCLIC) ? ((nu)->orderu - 1) : 0) ) #define KNOTSV(nu) ( (nu)->orderv + (nu)->pntsv + (((nu)->flagv & CU_NURB_CYCLIC) ? ((nu)->orderv - 1) : 0) ) @@ -108,7 +116,8 @@ void BK_curve_nurbs_vertexCos_apply(struct ListBase *lb, float (*vertexCos)[3]); float (*BKE_curve_nurbs_keyVertexCos_get(struct ListBase *lb, float *key))[3]; void BKE_curve_nurbs_keyVertexTilts_apply(struct ListBase *lb, float *key); -void BKE_curve_editNurb_keyIndex_free(struct EditNurb *editnurb); +void BKE_curve_editNurb_keyIndex_delCV(struct GHash *keyindex, const void *cv); +void BKE_curve_editNurb_keyIndex_free(struct GHash **keyindex); void BKE_curve_editNurb_free(struct Curve *cu); struct ListBase *BKE_curve_editNurbs_get(struct Curve *cu); diff --git a/source/blender/blenkernel/BKE_mesh.h b/source/blender/blenkernel/BKE_mesh.h index d41878825bb..b83bec5a302 100644 --- a/source/blender/blenkernel/BKE_mesh.h +++ b/source/blender/blenkernel/BKE_mesh.h @@ -131,8 +131,7 @@ bool BKE_mesh_uv_cdlayer_rename(struct Mesh *me, const char *old_name, const cha float (*BKE_mesh_vertexCos_get(const struct Mesh *me, int *r_numVerts))[3]; -void BKE_mesh_calc_normals_split(struct Mesh *mesh); -void BKE_mesh_split_faces(struct Mesh *mesh); +void BKE_mesh_split_faces(struct Mesh *mesh, bool free_loop_normals); struct Mesh *BKE_mesh_new_from_object(struct Main *bmain, struct Scene *sce, struct Object *ob, int apply_modifiers, int settings, int calc_tessface, int calc_undeformed); @@ -228,6 +227,9 @@ void BKE_lnor_space_custom_normal_to_data(MLoopNorSpace *lnor_space, const float bool BKE_mesh_has_custom_loop_normals(struct Mesh *me); +void BKE_mesh_calc_normals_split(struct Mesh *mesh); +void BKE_mesh_calc_normals_split_ex(struct Mesh *mesh, struct MLoopNorSpaceArray *r_lnors_spacearr); + void BKE_mesh_normals_loop_split( 
const struct MVert *mverts, const int numVerts, struct MEdge *medges, const int numEdges, struct MLoop *mloops, float (*r_loopnors)[3], const int numLoops, diff --git a/source/blender/blenkernel/BKE_object.h b/source/blender/blenkernel/BKE_object.h index d812ab832a1..89adbc4338f 100644 --- a/source/blender/blenkernel/BKE_object.h +++ b/source/blender/blenkernel/BKE_object.h @@ -139,8 +139,6 @@ void BKE_boundbox_init_from_minmax(struct BoundBox *bb, const float min[3], cons void BKE_boundbox_calc_center_aabb(const struct BoundBox *bb, float r_cent[3]); void BKE_boundbox_calc_size_aabb(const struct BoundBox *bb, float r_size[3]); void BKE_boundbox_minmax(const struct BoundBox *bb, float obmat[4][4], float r_min[3], float r_max[3]); -struct BoundBox *BKE_boundbox_ensure_minimum_dimensions( - struct BoundBox *bb, struct BoundBox *bb_temp, const float epsilon); struct BoundBox *BKE_object_boundbox_get(struct Object *ob); void BKE_object_dimensions_get(struct Object *ob, float vec[3]); diff --git a/source/blender/blenkernel/intern/appdir.c b/source/blender/blenkernel/intern/appdir.c index b1dcc40279f..f2f0a92d8b3 100644 --- a/source/blender/blenkernel/intern/appdir.c +++ b/source/blender/blenkernel/intern/appdir.c @@ -121,7 +121,7 @@ static bool test_path(char *targetpath, const char *path_base, const char *path_ if (path_sep) BLI_join_dirfile(tmppath, sizeof(tmppath), path_base, path_sep); else BLI_strncpy(tmppath, path_base, sizeof(tmppath)); - /* rare cases folder_name is omitted (when looking for ~/.blender/2.xx dir only) */ + /* rare cases folder_name is omitted (when looking for ~/.config/blender/2.xx dir only) */ if (folder_name) BLI_make_file_string("/", targetpath, tmppath, folder_name); else @@ -755,7 +755,6 @@ static void where_is_temp(char *fullname, char *basename, const size_t maxlen, c void BKE_tempdir_init(char *userdir) { where_is_temp(btempdir_session, btempdir_base, FILE_MAX, userdir); -; } /** diff --git a/source/blender/blenkernel/intern/bvhutils.c 
b/source/blender/blenkernel/intern/bvhutils.c index d0e0c82e3be..c0e4ef37a93 100644 --- a/source/blender/blenkernel/intern/bvhutils.c +++ b/source/blender/blenkernel/intern/bvhutils.c @@ -376,45 +376,6 @@ static void mesh_edges_spherecast(void *userdata, int index, const BVHTreeRay *r } } -#define V3_MUL_ELEM(a, b) \ - (a)[0] * (b)[0], \ - (a)[1] * (b)[1], \ - (a)[2] * (b)[2] - -/* Callback to bvh tree nearest edge to ray. - * The tree must have been built using bvhtree_from_mesh_edges. - * userdata must be a BVHMeshCallbackUserdata built from the same mesh as the tree. */ -static void mesh_edges_nearest_to_ray( - void *userdata, const float ray_co[3], const float ray_dir[3], - const float scale[3], int index, BVHTreeNearest *nearest) -{ - struct BVHTreeFromMesh *data = userdata; - const MVert *vert = data->vert; - const MEdge *e = &data->edge[index]; - - const float t0[3] = {V3_MUL_ELEM(vert[e->v1].co, scale)}; - const float t1[3] = {V3_MUL_ELEM(vert[e->v2].co, scale)}; - const float origin_sc[3] = {V3_MUL_ELEM(ray_co, scale)}; - const float dir_sc[3] = {V3_MUL_ELEM(ray_dir, scale)}; - - float depth, point[3]; - const float dist_sq = dist_squared_ray_to_seg_v3(origin_sc, dir_sc, t0, t1, point, &depth); - - if (dist_sq < nearest->dist_sq) { - nearest->dist_sq = dist_sq; - nearest->index = index; - - point[0] /= scale[0]; - point[1] /= scale[1]; - point[2] /= scale[2]; - - copy_v3_v3(nearest->co, point); - sub_v3_v3v3(nearest->no, t0, t1); - } -} - -#undef V3_MUL_ELEM - /** \} */ /* @@ -459,7 +420,7 @@ static BVHTree *bvhtree_from_editmesh_verts_create_tree( static BVHTree *bvhtree_from_mesh_verts_create_tree( float epsilon, int tree_type, int axis, - MVert *vert, const int verts_num, + const MVert *vert, const int verts_num, const BLI_bitmap *verts_mask, int verts_num_active) { BLI_assert(vert != NULL); @@ -488,31 +449,23 @@ static BVHTree *bvhtree_from_mesh_verts_create_tree( static void bvhtree_from_mesh_verts_setup_data( BVHTreeFromMesh *data, BVHTree *tree, 
const bool is_cached, float epsilon, - MVert *vert, const bool vert_allocated) + const MVert *vert, const bool vert_allocated) { memset(data, 0, sizeof(*data)); - if (tree) { - data->tree = tree; - data->cached = is_cached; + data->tree = tree; + data->cached = is_cached; - /* a NULL nearest callback works fine - * remember the min distance to point is the same as the min distance to BV of point */ - data->nearest_callback = NULL; - data->raycast_callback = mesh_verts_spherecast; - data->nearest_to_ray_callback = NULL; + /* a NULL nearest callback works fine + * remember the min distance to point is the same as the min distance to BV of point */ + data->nearest_callback = NULL; + data->raycast_callback = mesh_verts_spherecast; - data->vert = vert; - data->vert_allocated = vert_allocated; - //data->face = DM_get_tessface_array(dm, &data->face_allocated); /* XXX WHY???? */ + data->vert = vert; + data->vert_allocated = vert_allocated; + //data->face = DM_get_tessface_array(dm, &data->face_allocated); /* XXX WHY???? 
*/ - data->sphere_radius = epsilon; - } - else { - if (vert_allocated) { - MEM_freeN(vert); - } - } + data->sphere_radius = epsilon; } /* Builds a bvh tree where nodes are the vertices of the given em */ @@ -531,7 +484,6 @@ BVHTree *bvhtree_from_editmesh_verts_ex( data->em = em; data->nearest_callback = NULL; data->raycast_callback = editmesh_verts_spherecast; - data->nearest_to_ray_callback = NULL; } return tree; @@ -588,11 +540,18 @@ BVHTree *bvhtree_from_mesh_verts( /* printf("BVHTree is already build, using cached tree\n"); */ } - /* Setup BVHTreeFromMesh */ - bvhtree_from_mesh_verts_setup_data( - data, tree, true, epsilon, vert, vert_allocated); - - return data->tree; + if (tree) { + /* Setup BVHTreeFromMesh */ + bvhtree_from_mesh_verts_setup_data( + data, tree, true, epsilon, vert, vert_allocated); + } + else { + if (vert_allocated) { + MEM_freeN(vert); + } + memset(data, 0, sizeof(*data)); + } + return tree; } /** @@ -602,7 +561,7 @@ BVHTree *bvhtree_from_mesh_verts( * \param verts_num_active if >= 0, number of active verts to add to BVH tree (else will be computed from mask). 
*/ BVHTree *bvhtree_from_mesh_verts_ex( - BVHTreeFromMesh *data, MVert *vert, const int verts_num, const bool vert_allocated, + BVHTreeFromMesh *data, const MVert *vert, const int verts_num, const bool vert_allocated, const BLI_bitmap *verts_mask, int verts_num_active, float epsilon, int tree_type, int axis) { @@ -613,7 +572,7 @@ BVHTree *bvhtree_from_mesh_verts_ex( bvhtree_from_mesh_verts_setup_data( data, tree, false, epsilon, vert, vert_allocated); - return data->tree; + return tree; } /** \} */ @@ -661,7 +620,7 @@ static BVHTree *bvhtree_from_editmesh_edges_create_tree( } static BVHTree *bvhtree_from_mesh_edges_create_tree( - MVert *vert, MEdge *edge, const int edge_num, + const MVert *vert, const MEdge *edge, const int edge_num, const BLI_bitmap *edges_mask, int edges_num_active, float epsilon, int tree_type, int axis) { @@ -694,34 +653,26 @@ static BVHTree *bvhtree_from_mesh_edges_create_tree( } static void bvhtree_from_mesh_edges_setup_data( - BVHTreeFromMesh *data, BVHTree *tree, const bool is_cached, float epsilon, - MVert *vert, const bool vert_allocated, MEdge *edge, const bool edge_allocated) + BVHTreeFromMesh *data, BVHTree *tree, + const bool is_cached, float epsilon, + const MVert *vert, const bool vert_allocated, + const MEdge *edge, const bool edge_allocated) { memset(data, 0, sizeof(*data)); + data->tree = tree; - if (data->tree) { - data->cached = is_cached; + data->cached = is_cached; - data->nearest_callback = mesh_edges_nearest_point; - data->raycast_callback = mesh_edges_spherecast; - data->nearest_to_ray_callback = mesh_edges_nearest_to_ray; + data->nearest_callback = mesh_edges_nearest_point; + data->raycast_callback = mesh_edges_spherecast; - data->vert = vert; - data->vert_allocated = vert_allocated; - data->edge = edge; - data->edge_allocated = edge_allocated; + data->vert = vert; + data->vert_allocated = vert_allocated; + data->edge = edge; + data->edge_allocated = edge_allocated; - data->sphere_radius = epsilon; - } - else { - if 
(vert_allocated) { - MEM_freeN(vert); - } - if (edge_allocated) { - MEM_freeN(edge); - } - } + data->sphere_radius = epsilon; } /* Builds a bvh tree where nodes are the edges of the given em */ @@ -742,8 +693,6 @@ BVHTree *bvhtree_from_editmesh_edges_ex( data->em = em; data->nearest_callback = NULL; /* TODO */ data->raycast_callback = NULL; /* TODO */ - /* TODO: not urgent however since users currently define own callbacks */ - data->nearest_to_ray_callback = NULL; } return tree; @@ -795,11 +744,21 @@ BVHTree *bvhtree_from_mesh_edges( /* printf("BVHTree is already build, using cached tree\n"); */ } - /* Setup BVHTreeFromMesh */ - bvhtree_from_mesh_edges_setup_data( - data, tree, true, epsilon, vert, vert_allocated, edge, edge_allocated); - - return data->tree; + if (tree) { + /* Setup BVHTreeFromMesh */ + bvhtree_from_mesh_edges_setup_data( + data, tree, true, epsilon, vert, vert_allocated, edge, edge_allocated); + } + else { + if (vert_allocated) { + MEM_freeN(vert); + } + if (edge_allocated) { + MEM_freeN(edge); + } + memset(data, 0, sizeof(*data)); + } + return tree; } /** @@ -810,8 +769,8 @@ BVHTree *bvhtree_from_mesh_edges( */ BVHTree *bvhtree_from_mesh_edges_ex( BVHTreeFromMesh *data, - MVert *vert, const bool vert_allocated, - MEdge *edge, const int edges_num, const bool edge_allocated, + const MVert *vert, const bool vert_allocated, + const MEdge *edge, const int edges_num, const bool edge_allocated, const BLI_bitmap *edges_mask, int edges_num_active, float epsilon, int tree_type, int axis) { @@ -823,7 +782,7 @@ BVHTree *bvhtree_from_mesh_edges_ex( bvhtree_from_mesh_edges_setup_data( data, tree, false, epsilon, vert, vert_allocated, edge, edge_allocated); - return data->tree; + return tree; } /** \} */ @@ -836,7 +795,7 @@ BVHTree *bvhtree_from_mesh_edges_ex( static BVHTree *bvhtree_from_mesh_faces_create_tree( float epsilon, int tree_type, int axis, - MVert *vert, MFace *face, const int faces_num, + const MVert *vert, const MFace *face, const int faces_num, 
const BLI_bitmap *faces_mask, int faces_num_active) { BVHTree *tree = NULL; @@ -880,34 +839,23 @@ static BVHTree *bvhtree_from_mesh_faces_create_tree( static void bvhtree_from_mesh_faces_setup_data( BVHTreeFromMesh *data, BVHTree *tree, const bool is_cached, float epsilon, - MVert *vert, const bool vert_allocated, - MFace *face, const bool face_allocated) + const MVert *vert, const bool vert_allocated, + const MFace *face, const bool face_allocated) { memset(data, 0, sizeof(*data)); - if (tree) { - data->tree = tree; - data->cached = is_cached; + data->tree = tree; + data->cached = is_cached; - data->nearest_callback = mesh_faces_nearest_point; - data->raycast_callback = mesh_faces_spherecast; - data->nearest_to_ray_callback = NULL; + data->nearest_callback = mesh_faces_nearest_point; + data->raycast_callback = mesh_faces_spherecast; - data->vert = vert; - data->vert_allocated = vert_allocated; - data->face = face; - data->face_allocated = face_allocated; + data->vert = vert; + data->vert_allocated = vert_allocated; + data->face = face; + data->face_allocated = face_allocated; - data->sphere_radius = epsilon; - } - else { - if (vert_allocated) { - MEM_freeN(vert); - } - if (face_allocated) { - MEM_freeN(face); - } - } + data->sphere_radius = epsilon; } /* Builds a bvh tree where nodes are the tesselated faces of the given dm */ @@ -950,10 +898,21 @@ BVHTree *bvhtree_from_mesh_faces( /* printf("BVHTree is already build, using cached tree\n"); */ } - /* Setup BVHTreeFromMesh */ - bvhtree_from_mesh_faces_setup_data(data, tree, true, epsilon, vert, vert_allocated, face, face_allocated); - - return data->tree; + if (tree) { + /* Setup BVHTreeFromMesh */ + bvhtree_from_mesh_faces_setup_data( + data, tree, true, epsilon, vert, vert_allocated, face, face_allocated); + } + else { + if (vert_allocated) { + MEM_freeN(vert); + } + if (face_allocated) { + MEM_freeN(face); + } + memset(data, 0, sizeof(*data)); + } + return tree; } /** @@ -964,8 +923,8 @@ BVHTree 
*bvhtree_from_mesh_faces( * \param numFaces_active if >= 0, number of active faces to add to BVH tree (else will be computed from mask). */ BVHTree *bvhtree_from_mesh_faces_ex( - BVHTreeFromMesh *data, MVert *vert, const bool vert_allocated, - MFace *face, const int numFaces, const bool face_allocated, + BVHTreeFromMesh *data, const MVert *vert, const bool vert_allocated, + const MFace *face, const int numFaces, const bool face_allocated, const BLI_bitmap *faces_mask, int faces_num_active, float epsilon, int tree_type, int axis) { @@ -975,9 +934,10 @@ BVHTree *bvhtree_from_mesh_faces_ex( faces_mask, faces_num_active); /* Setup BVHTreeFromMesh */ - bvhtree_from_mesh_faces_setup_data(data, tree, false, epsilon, vert, vert_allocated, face, face_allocated); + bvhtree_from_mesh_faces_setup_data( + data, tree, false, epsilon, vert, vert_allocated, face, face_allocated); - return data->tree; + return tree; } /** \} */ @@ -1088,34 +1048,20 @@ static void bvhtree_from_mesh_looptri_setup_data( { memset(data, 0, sizeof(*data)); - if (tree) { - data->tree = tree; - data->cached = is_cached; + data->tree = tree; + data->cached = is_cached; - data->nearest_callback = mesh_looptri_nearest_point; - data->raycast_callback = mesh_looptri_spherecast; - data->nearest_to_ray_callback = NULL; + data->nearest_callback = mesh_looptri_nearest_point; + data->raycast_callback = mesh_looptri_spherecast; - data->vert = vert; - data->vert_allocated = vert_allocated; - data->loop = mloop; - data->loop_allocated = loop_allocated; - data->looptri = looptri; - data->looptri_allocated = looptri_allocated; + data->vert = vert; + data->vert_allocated = vert_allocated; + data->loop = mloop; + data->loop_allocated = loop_allocated; + data->looptri = looptri; + data->looptri_allocated = looptri_allocated; - data->sphere_radius = epsilon; - } - else { - if (vert_allocated) { - MEM_freeN((void *)vert); - } - if (loop_allocated) { - MEM_freeN((void *)mloop); - } - if (looptri_allocated) { - MEM_freeN((void 
*)looptri); - } - } + data->sphere_radius = epsilon; } /** @@ -1160,7 +1106,6 @@ BVHTree *bvhtree_from_editmesh_looptri_ex( data->tree = tree; data->nearest_callback = editmesh_looptri_nearest_point; data->raycast_callback = editmesh_looptri_spherecast; - data->nearest_to_ray_callback = NULL; data->sphere_radius = 0.0f; data->em = em; data->cached = bvhCache != NULL; @@ -1242,14 +1187,28 @@ BVHTree *bvhtree_from_mesh_looptri( /* printf("BVHTree is already build, using cached tree\n"); */ } - /* Setup BVHTreeFromMesh */ - bvhtree_from_mesh_looptri_setup_data( - data, tree, true, epsilon, - mvert, vert_allocated, - mloop, loop_allocated, - looptri, looptri_allocated); + if (tree) { + /* Setup BVHTreeFromMesh */ + bvhtree_from_mesh_looptri_setup_data( + data, tree, true, epsilon, + mvert, vert_allocated, + mloop, loop_allocated, + looptri, looptri_allocated); + } + else { + if (vert_allocated) { + MEM_freeN(mvert); + } + if (loop_allocated) { + MEM_freeN(mloop); + } + if (looptri_allocated) { + MEM_freeN((void *)looptri); + } + memset(data, 0, sizeof(*data)); + } - return data->tree; + return tree; } BVHTree *bvhtree_from_mesh_looptri_ex( @@ -1272,7 +1231,7 @@ BVHTree *bvhtree_from_mesh_looptri_ex( mloop, loop_allocated, looptri, looptri_allocated); - return data->tree; + return tree; } /** \} */ @@ -1292,29 +1251,27 @@ void free_bvhtree_from_editmesh(struct BVHTreeFromEditMesh *data) /* Frees data allocated by a call to bvhtree_from_mesh_*. 
*/ void free_bvhtree_from_mesh(struct BVHTreeFromMesh *data) { - if (data->tree) { - if (!data->cached) { - BLI_bvhtree_free(data->tree); - } - - if (data->vert_allocated) { - MEM_freeN((void *)data->vert); - } - if (data->edge_allocated) { - MEM_freeN((void *)data->edge); - } - if (data->face_allocated) { - MEM_freeN((void *)data->face); - } - if (data->loop_allocated) { - MEM_freeN((void *)data->loop); - } - if (data->looptri_allocated) { - MEM_freeN((void *)data->looptri); - } + if (data->tree && !data->cached) { + BLI_bvhtree_free(data->tree); + } - memset(data, 0, sizeof(*data)); + if (data->vert_allocated) { + MEM_freeN((void *)data->vert); + } + if (data->edge_allocated) { + MEM_freeN((void *)data->edge); + } + if (data->face_allocated) { + MEM_freeN((void *)data->face); } + if (data->loop_allocated) { + MEM_freeN((void *)data->loop); + } + if (data->looptri_allocated) { + MEM_freeN((void *)data->looptri); + } + + memset(data, 0, sizeof(*data)); } diff --git a/source/blender/blenkernel/intern/cdderivedmesh.c b/source/blender/blenkernel/intern/cdderivedmesh.c index 483fa977aff..7042b46330b 100644 --- a/source/blender/blenkernel/intern/cdderivedmesh.c +++ b/source/blender/blenkernel/intern/cdderivedmesh.c @@ -2428,8 +2428,12 @@ static DerivedMesh *cddm_copy_ex(DerivedMesh *source, dm->cd_flag = source->cd_flag; dm->dirty = source->dirty; - /* Tessellation data is never copied, so tag it here. */ - dm->dirty |= DM_DIRTY_TESS_CDLAYERS; + /* Tessellation data is never copied, so tag it here. + * Only tag dirty layers if we really ignored tessellation faces. 
+ */ + if (!copy_tessface_data) { + dm->dirty |= DM_DIRTY_TESS_CDLAYERS; + } CustomData_copy_data(&source->vertData, &dm->vertData, 0, 0, numVerts); CustomData_copy_data(&source->edgeData, &dm->edgeData, 0, 0, numEdges); diff --git a/source/blender/blenkernel/intern/curve.c b/source/blender/blenkernel/intern/curve.c index 90a514781d7..439abb1d593 100644 --- a/source/blender/blenkernel/intern/curve.c +++ b/source/blender/blenkernel/intern/curve.c @@ -89,20 +89,33 @@ void BKE_curve_editfont_free(Curve *cu) } } -void BKE_curve_editNurb_keyIndex_free(EditNurb *editnurb) +static void curve_editNurb_keyIndex_cv_free_cb(void *val) { - if (!editnurb->keyindex) { + CVKeyIndex *index = val; + MEM_freeN(index->orig_cv); + MEM_freeN(val); +} + +void BKE_curve_editNurb_keyIndex_delCV(GHash *keyindex, const void *cv) +{ + BLI_assert(keyindex != NULL); + BLI_ghash_remove(keyindex, cv, NULL, curve_editNurb_keyIndex_cv_free_cb); +} + +void BKE_curve_editNurb_keyIndex_free(GHash **keyindex) +{ + if (!(*keyindex)) { return; } - BLI_ghash_free(editnurb->keyindex, NULL, MEM_freeN); - editnurb->keyindex = NULL; + BLI_ghash_free(*keyindex, NULL, curve_editNurb_keyIndex_cv_free_cb); + *keyindex = NULL; } void BKE_curve_editNurb_free(Curve *cu) { if (cu->editnurb) { BKE_nurbList_free(&cu->editnurb->nurbs); - BKE_curve_editNurb_keyIndex_free(cu->editnurb); + BKE_curve_editNurb_keyIndex_free(&cu->editnurb->keyindex); MEM_freeN(cu->editnurb); cu->editnurb = NULL; } diff --git a/source/blender/blenkernel/intern/depsgraph.c b/source/blender/blenkernel/intern/depsgraph.c index 294a4ce76b7..678dc92a5f2 100644 --- a/source/blender/blenkernel/intern/depsgraph.c +++ b/source/blender/blenkernel/intern/depsgraph.c @@ -544,10 +544,16 @@ static void build_dag_object(DagForest *dag, DagNode *scenenode, Main *bmain, Sc if (ct->tar->type == OB_MESH) node3->customdata_mask |= CD_MASK_MDEFORMVERT; } - else if (ELEM(con->type, CONSTRAINT_TYPE_FOLLOWPATH, CONSTRAINT_TYPE_CLAMPTO, CONSTRAINT_TYPE_SPLINEIK)) + 
else if (ELEM(con->type, CONSTRAINT_TYPE_FOLLOWPATH, + CONSTRAINT_TYPE_CLAMPTO, + CONSTRAINT_TYPE_SPLINEIK, + CONSTRAINT_TYPE_SHRINKWRAP)) + { dag_add_relation(dag, node3, node, DAG_RL_DATA_DATA | DAG_RL_OB_DATA, cti->name); - else + } + else { dag_add_relation(dag, node3, node, DAG_RL_OB_DATA, cti->name); + } } } @@ -881,8 +887,12 @@ static void build_dag_object(DagForest *dag, DagNode *scenenode, Main *bmain, Sc if (obt->type == OB_MESH) node2->customdata_mask |= CD_MASK_MDEFORMVERT; } - else + else if (cti->type == CONSTRAINT_TYPE_SHRINKWRAP) { + dag_add_relation(dag, node2, node, DAG_RL_DATA_DATA | DAG_RL_OB_DATA, cti->name); + } + else { dag_add_relation(dag, node2, node, DAG_RL_OB_OB, cti->name); + } } addtoroot = 0; } diff --git a/source/blender/blenkernel/intern/displist.c b/source/blender/blenkernel/intern/displist.c index 49db75a0474..f8a9d57f579 100644 --- a/source/blender/blenkernel/intern/displist.c +++ b/source/blender/blenkernel/intern/displist.c @@ -819,7 +819,7 @@ static void curve_calc_modifiers_pre(Scene *scene, Object *ob, ListBase *nurb, if (editmode) required_mode |= eModifierMode_Editmode; - if (cu->editnurb == NULL) { + if (!editmode) { keyVerts = BKE_key_evaluate_object(ob, &numVerts); if (keyVerts) { diff --git a/source/blender/blenkernel/intern/effect.c b/source/blender/blenkernel/intern/effect.c index fe8f5ebdca6..4eee24b378f 100644 --- a/source/blender/blenkernel/intern/effect.c +++ b/source/blender/blenkernel/intern/effect.c @@ -848,6 +848,14 @@ static void do_physical_effector(EffectorCache *eff, EffectorData *efd, Effected break; case PFIELD_FORCE: normalize_v3(force); + if (pd->flag & PFIELD_GRAVITATION){ /* Option: Multiply by 1/distance^2 */ + if (efd->distance < FLT_EPSILON){ + strength = 0.0f; + } + else { + strength *= powf(efd->distance, -2.0f); + } + } mul_v3_fl(force, strength * efd->falloff); break; case PFIELD_VORTEX: diff --git a/source/blender/blenkernel/intern/library_remap.c 
b/source/blender/blenkernel/intern/library_remap.c index a408b498f18..b6f4621a0b3 100644 --- a/source/blender/blenkernel/intern/library_remap.c +++ b/source/blender/blenkernel/intern/library_remap.c @@ -179,6 +179,7 @@ static int foreach_libblock_remap_callback(void *user_data, ID *id_self, ID **id * on the other hand since they get reset to lib data on file open/reload it is indirect too... * Edit Mode is also a 'skip direct' case. */ const bool is_obj = (GS(id->name) == ID_OB); + const bool is_obj_proxy = (is_obj && (((Object *)id)->proxy || ((Object *)id)->proxy_group)); const bool is_obj_editmode = (is_obj && BKE_object_is_in_editmode((Object *)id)); const bool is_never_null = ((cb_flag & IDWALK_CB_NEVER_NULL) && (new_id == NULL) && (id_remap_data->flag & ID_REMAP_FORCE_NEVER_NULL_USAGE) == 0); @@ -231,7 +232,7 @@ static int foreach_libblock_remap_callback(void *user_data, ID *id_self, ID **id /* We cannot affect old_id->us directly, LIB_TAG_EXTRAUSER(_SET) are assumed to be set as needed, * that extra user is processed in final handling... */ } - if (!is_indirect) { + if (!is_indirect || is_obj_proxy) { id_remap_data->status |= ID_REMAP_IS_LINKED_DIRECT; } } diff --git a/source/blender/blenkernel/intern/mesh.c b/source/blender/blenkernel/intern/mesh.c index af02e02b017..befe1a4d70e 100644 --- a/source/blender/blenkernel/intern/mesh.c +++ b/source/blender/blenkernel/intern/mesh.c @@ -39,7 +39,9 @@ #include "BLI_utildefines.h" #include "BLI_math.h" +#include "BLI_linklist.h" #include "BLI_listbase.h" +#include "BLI_memarena.h" #include "BLI_edgehash.h" #include "BLI_string.h" @@ -66,6 +68,11 @@ #include "DEG_depsgraph.h" +/* Define for cases when you want extra validation of mesh + * after certain modifications. 
+ */ +// #undef VALIDATE_MESH + enum { MESHCMP_DVERT_WEIGHTMISMATCH = 1, MESHCMP_DVERT_GROUPMISMATCH, @@ -2048,7 +2055,7 @@ void BKE_mesh_mselect_active_set(Mesh *me, int index, int type) (me->mselect[me->totselect - 1].type == type)); } -void BKE_mesh_calc_normals_split(Mesh *mesh) +void BKE_mesh_calc_normals_split_ex(Mesh *mesh, MLoopNorSpaceArray *r_lnors_spacearr) { float (*r_loopnors)[3]; float (*polynors)[3]; @@ -2083,111 +2090,330 @@ void BKE_mesh_calc_normals_split(Mesh *mesh) BKE_mesh_normals_loop_split( mesh->mvert, mesh->totvert, mesh->medge, mesh->totedge, mesh->mloop, r_loopnors, mesh->totloop, mesh->mpoly, (const float (*)[3])polynors, mesh->totpoly, - (mesh->flag & ME_AUTOSMOOTH) != 0, mesh->smoothresh, NULL, clnors, NULL); + (mesh->flag & ME_AUTOSMOOTH) != 0, mesh->smoothresh, r_lnors_spacearr, clnors, NULL); if (free_polynors) { MEM_freeN(polynors); } } -/* Spli faces based on the edge angle. - * Matches behavior of face splitting in render engines. - */ -void BKE_mesh_split_faces(Mesh *mesh) +void BKE_mesh_calc_normals_split(Mesh *mesh) { - const int num_verts = mesh->totvert; - const int num_edges = mesh->totedge; - const int num_polys = mesh->totpoly; + BKE_mesh_calc_normals_split_ex(mesh, NULL); +} + +/* Split faces helper functions. */ + +typedef struct SplitFaceNewVert { + struct SplitFaceNewVert *next; + int new_index; + int orig_index; + float *vnor; +} SplitFaceNewVert; + +typedef struct SplitFaceNewEdge { + struct SplitFaceNewEdge *next; + int new_index; + int orig_index; + int v1; + int v2; +} SplitFaceNewEdge; + +/* Detect needed new vertices, and update loops' vertex indices accordingly. + * WARNING! Leaves mesh in invalid state. */ +static int split_faces_prepare_new_verts( + const Mesh *mesh, MLoopNorSpaceArray *lnors_spacearr, SplitFaceNewVert **new_verts, MemArena *memarena, + bool *r_need_vnors_recalc) +{ + /* Note: if lnors_spacearr is NULL, there is no autosmooth handling, and we only split out flat polys. 
*/ + const int num_loops = mesh->totloop; + int num_verts = mesh->totvert; MVert *mvert = mesh->mvert; - MEdge *medge = mesh->medge; MLoop *mloop = mesh->mloop; - MPoly *mpoly = mesh->mpoly; - float (*lnors)[3]; - int poly, num_new_verts = 0; - if ((mesh->flag & ME_AUTOSMOOTH) == 0) { - return; - } - BKE_mesh_tessface_clear(mesh); - /* Compute loop normals if needed. */ - if (!CustomData_has_layer(&mesh->ldata, CD_NORMAL)) { - BKE_mesh_calc_normals_split(mesh); - } - lnors = CustomData_get_layer(&mesh->ldata, CD_NORMAL); - /* Count number of vertices to be split. */ - for (poly = 0; poly < num_polys; poly++) { - MPoly *mp = &mpoly[poly]; - int loop; - for (loop = 0; loop < mp->totloop; loop++) { - MLoop *ml = &mloop[mp->loopstart + loop]; - MVert *mv = &mvert[ml->v]; - float vn[3]; - normal_short_to_float_v3(vn, mv->no); - if (!equals_v3v3(vn, lnors[mp->loopstart + loop])) { - num_new_verts++; + + BLI_bitmap *verts_used = BLI_BITMAP_NEW(num_verts, __func__); + + if (lnors_spacearr) { + BLI_bitmap *done_loops = BLI_BITMAP_NEW(num_loops, __func__); + + MLoop *ml = mloop; + MLoopNorSpace **lnor_space = lnors_spacearr->lspacearr; + for (int loop_idx = 0; loop_idx < num_loops; loop_idx++, ml++, lnor_space++) { + if (!BLI_BITMAP_TEST(done_loops, loop_idx)) { + const int vert_idx = ml->v; + const bool vert_used = BLI_BITMAP_TEST_BOOL(verts_used, vert_idx); + /* If vert is already used by another smooth fan, we need a new vert for this one. */ + const int new_vert_idx = vert_used ? num_verts++ : vert_idx; + + if ((*lnor_space)->loops) { + for (LinkNode *lnode = (*lnor_space)->loops; lnode; lnode = lnode->next) { + const int ml_fan_idx = GET_INT_FROM_POINTER(lnode->link); + BLI_BITMAP_ENABLE(done_loops, ml_fan_idx); + if (vert_used) { + mloop[ml_fan_idx].v = new_vert_idx; + } + } + } + else { + /* Single loop in this fan... 
*/ + BLI_BITMAP_ENABLE(done_loops, loop_idx); + if (vert_used) { + ml->v = new_vert_idx; + } + } + + if (!vert_used) { + BLI_BITMAP_ENABLE(verts_used, vert_idx); + /* We need to update that vertex's normal here, we won't go over it again. */ + /* This is important! *DO NOT* set vnor to final computed lnor, vnor should always be defined to + * 'automatic normal' value computed from its polys, not some custom normal. + * Fortunately, that's the loop normal space's 'lnor' reference vector. ;) */ + normal_float_to_short_v3(mvert[vert_idx].no, (*lnor_space)->vec_lnor); + } + else { + /* Add new vert to list. */ + SplitFaceNewVert *new_vert = BLI_memarena_alloc(memarena, sizeof(*new_vert)); + new_vert->orig_index = vert_idx; + new_vert->new_index = new_vert_idx; + new_vert->vnor = (*lnor_space)->vec_lnor; /* See note above. */ + new_vert->next = *new_verts; + *new_verts = new_vert; + } } } + + MEM_freeN(done_loops); } - if (num_new_verts == 0) { - /* No new vertices are to be added, can do early exit. */ - return; - } - /* Reallocate all vert and edge related data. */ - mesh->totvert += num_new_verts; - mesh->totedge += 2 * num_new_verts; - CustomData_realloc(&mesh->vdata, mesh->totvert); - CustomData_realloc(&mesh->edata, mesh->totedge); - /* Update pointers to a newly allocated memory. */ - BKE_mesh_update_customdata_pointers(mesh, false); - mvert = mesh->mvert; - medge = mesh->medge; - /* Perform actual vertex split. */ - num_new_verts = 0; - for (poly = 0; poly < num_polys; poly++) { - MPoly *mp = &mpoly[poly]; - int loop; - for (loop = 0; loop < mp->totloop; loop++) { - int poly_loop = mp->loopstart + loop; - MLoop *ml = &mloop[poly_loop]; - MVert *mv = &mvert[ml->v]; - float vn[3]; - normal_short_to_float_v3(vn, mv->no); - if (!equals_v3v3(vn, lnors[mp->loopstart + loop])) { - int poly_loop_prev = mp->loopstart + (loop + mp->totloop - 1) % mp->totloop; - MLoop *ml_prev = &mloop[poly_loop_prev]; - int new_edge_prev, new_edge; - /* Cretae new vertex. 
*/ - int new_vert = num_verts + num_new_verts; - CustomData_copy_data(&mesh->vdata, &mesh->vdata, - ml->v, new_vert, 1); - normal_float_to_short_v3(mvert[new_vert].no, - lnors[poly_loop]); - /* Create new edges. */ - new_edge_prev = num_edges + 2 * num_new_verts; - new_edge = num_edges + 2 * num_new_verts + 1; - CustomData_copy_data(&mesh->edata, &mesh->edata, - ml_prev->e, new_edge_prev, 1); - CustomData_copy_data(&mesh->edata, &mesh->edata, - ml->e, new_edge, 1); - if (medge[new_edge_prev].v1 == ml->v) { - medge[new_edge_prev].v1 = new_vert; + else { + /* No loop normal spaces available, we only split out flat polys. */ + const int num_polys = mesh->totpoly; + const MPoly *mpoly = mesh->mpoly; + + /* We do that in two loops, to keep original edges/verts to smooth polys preferencially. */ + const MPoly *mp = mpoly; + for (int i = 0; i < num_polys; i++, mp++) { + if (mp->flag & ME_SMOOTH) { + const MLoop *ml = &mloop[mp->loopstart]; + for (int j = 0; j < mp->totloop; j++, ml++) { + /* Just mark the vertex as used/reserved, that way neighbor flat polys, if any, + * will have to create their own. */ + BLI_BITMAP_ENABLE(verts_used, ml->v); } - else { - medge[new_edge_prev].v2 = new_vert; + } + } + + mp = mpoly; + for (int i = 0; i < num_polys; i++, mp++) { + if (!(mp->flag & ME_SMOOTH)) { + MLoop *ml = &mloop[mp->loopstart]; + for (int j = 0; j < mp->totloop; j++, ml++) { + const int vert_idx = ml->v; + + if (BLI_BITMAP_TEST(verts_used, vert_idx)) { + /* Add new vert to list. */ + const int new_vert_idx = num_verts++; + ml->v = new_vert_idx; + + SplitFaceNewVert *new_vert = BLI_memarena_alloc(memarena, sizeof(*new_vert)); + new_vert->orig_index = vert_idx; + new_vert->new_index = new_vert_idx; + new_vert->vnor = NULL; /* See note below about normals. 
*/ + new_vert->next = *new_verts; + *new_verts = new_vert; + } + else { + BLI_BITMAP_ENABLE(verts_used, vert_idx); + } } - if (medge[new_edge].v1 == ml->v) { - medge[new_edge].v1 = new_vert; + /* Note: there is no way to get new normals for smooth vertices here (and we don't have direct access + * to poly normals either for flat ones), so we'll have to recompute all vnors at the end... */ + *r_need_vnors_recalc = true; + } + } + } + + MEM_freeN(verts_used); + + return num_verts - mesh->totvert; +} + +/* Detect needed new edges, and update accordingly loops' edge indices. + * WARNING! Leaves mesh in invalid state. */ +static int split_faces_prepare_new_edges( + const Mesh *mesh, SplitFaceNewEdge **new_edges, MemArena *memarena) +{ + const int num_polys = mesh->totpoly; + int num_edges = mesh->totedge; + MEdge *medge = mesh->medge; + MLoop *mloop = mesh->mloop; + const MPoly *mpoly = mesh->mpoly; + + BLI_bitmap *edges_used = BLI_BITMAP_NEW(num_edges, __func__); + EdgeHash *edges_hash = BLI_edgehash_new_ex(__func__, num_edges); + + const MPoly *mp = mpoly; + for (int poly_idx = 0; poly_idx < num_polys; poly_idx++, mp++) { + MLoop *ml_prev = &mloop[mp->loopstart + mp->totloop - 1]; + MLoop *ml = &mloop[mp->loopstart]; + for (int loop_idx = 0; loop_idx < mp->totloop; loop_idx++, ml++) { + void **eval; + if (!BLI_edgehash_ensure_p(edges_hash, ml_prev->v, ml->v, &eval)) { + const int edge_idx = ml_prev->e; + + /* That edge has not been encountered yet, define it. */ + if (BLI_BITMAP_TEST(edges_used, edge_idx)) { + /* Original edge has already been used, we need to define a new one. 
*/ + const int new_edge_idx = num_edges++; + *eval = SET_INT_IN_POINTER(new_edge_idx); + ml_prev->e = new_edge_idx; + + SplitFaceNewEdge *new_edge = BLI_memarena_alloc(memarena, sizeof(*new_edge)); + new_edge->orig_index = edge_idx; + new_edge->new_index = new_edge_idx; + new_edge->v1 = ml_prev->v; + new_edge->v2 = ml->v; + new_edge->next = *new_edges; + *new_edges = new_edge; } else { - medge[new_edge].v2 = new_vert; + /* We can re-use original edge. */ + medge[edge_idx].v1 = ml_prev->v; + medge[edge_idx].v2 = ml->v; + *eval = SET_INT_IN_POINTER(edge_idx); + BLI_BITMAP_ENABLE(edges_used, edge_idx); } - - ml->v = new_vert; - ml_prev->e = new_edge_prev; - ml->e = new_edge; - num_new_verts++; } + else { + /* Edge already known, just update loop's edge index. */ + ml_prev->e = GET_INT_FROM_POINTER(*eval); + } + + ml_prev = ml; } } + + MEM_freeN(edges_used); + BLI_edgehash_free(edges_hash, NULL); + + return num_edges - mesh->totedge; +} + +/* Perform actual split of vertices. */ +static void split_faces_split_new_verts( + Mesh *mesh, SplitFaceNewVert *new_verts, const int num_new_verts) +{ + const int num_verts = mesh->totvert - num_new_verts; + MVert *mvert = mesh->mvert; + + /* Remember new_verts is a single linklist, so its items are in reversed order... */ + MVert *new_mv = &mvert[mesh->totvert - 1]; + for (int i = mesh->totvert - 1; i >= num_verts ; i--, new_mv--, new_verts = new_verts->next) { + BLI_assert(new_verts->new_index == i); + BLI_assert(new_verts->new_index != new_verts->orig_index); + CustomData_copy_data(&mesh->vdata, &mesh->vdata, new_verts->orig_index, i, 1); + if (new_verts->vnor) { + normal_float_to_short_v3(new_mv->no, new_verts->vnor); + } + } +} + +/* Perform actual split of edges. 
*/ +static void split_faces_split_new_edges( + Mesh *mesh, SplitFaceNewEdge *new_edges, const int num_new_edges) +{ + const int num_edges = mesh->totedge - num_new_edges; + MEdge *medge = mesh->medge; + + /* Remember new_edges is a single linklist, so its items are in reversed order... */ + MEdge *new_med = &medge[mesh->totedge - 1]; + for (int i = mesh->totedge - 1; i >= num_edges ; i--, new_med--, new_edges = new_edges->next) { + BLI_assert(new_edges->new_index == i); + BLI_assert(new_edges->new_index != new_edges->orig_index); + CustomData_copy_data(&mesh->edata, &mesh->edata, new_edges->orig_index, i, 1); + new_med->v1 = new_edges->v1; + new_med->v2 = new_edges->v2; + } +} + +/* Split faces based on the edge angle and loop normals. + * Matches behavior of face splitting in render engines. + * + * NOTE: Will leave CD_NORMAL loop data layer which is + * used by render engines to set shading up. + */ +void BKE_mesh_split_faces(Mesh *mesh, bool free_loop_normals) +{ + const int num_polys = mesh->totpoly; + + if (num_polys == 0) { + return; + } + BKE_mesh_tessface_clear(mesh); + + MLoopNorSpaceArray *lnors_spacearr = NULL; + MemArena *memarena; + bool need_vnors_recalc = false; + + if (mesh->flag & ME_AUTOSMOOTH) { + lnors_spacearr = MEM_callocN(sizeof(*lnors_spacearr), __func__); + /* Compute loop normals and loop normal spaces (a.k.a. smooth fans of faces around vertices). */ + BKE_mesh_calc_normals_split_ex(mesh, lnors_spacearr); + /* Stealing memarena from loop normals space array. */ + memarena = lnors_spacearr->mem; + } + else { + /* We still have to split out flat faces... */ + memarena = BLI_memarena_new(BLI_MEMARENA_STD_BUFSIZE, __func__); + } + + SplitFaceNewVert *new_verts = NULL; + SplitFaceNewEdge *new_edges = NULL; + + /* Detect loop normal spaces (a.k.a. smooth fans) that will need a new vert. 
*/ + const int num_new_verts = split_faces_prepare_new_verts(mesh, lnors_spacearr, &new_verts, memarena, &need_vnors_recalc); + + if (num_new_verts > 0) { + /* Reminder: beyond this point, there is no way out, mesh is in invalid state (due to early-reassignment of + * loops' vertex and edge indices to new, to-be-created split ones). */ + + const int num_new_edges = split_faces_prepare_new_edges(mesh, &new_edges, memarena); + BLI_assert(num_new_edges > 0); + + /* Reallocate all vert and edge related data. */ + mesh->totvert += num_new_verts; + mesh->totedge += num_new_edges; + CustomData_realloc(&mesh->vdata, mesh->totvert); + CustomData_realloc(&mesh->edata, mesh->totedge); + /* Update pointers to a newly allocated memory. */ + BKE_mesh_update_customdata_pointers(mesh, false); + + /* Perform actual split of vertices and edges. */ + split_faces_split_new_verts(mesh, new_verts, num_new_verts); + split_faces_split_new_edges(mesh, new_edges, num_new_edges); + } + + /* Note: after this point mesh is expected to be valid again. */ + + /* CD_NORMAL is expected to be temporary only. */ + if (free_loop_normals) { + CustomData_free_layers(&mesh->ldata, CD_NORMAL, mesh->totloop); + } + + if (lnors_spacearr) { + /* Also frees new_verts/edges temp data, since we used its memarena to allocate them. 
*/ + BKE_lnor_spacearr_free(lnors_spacearr); + MEM_freeN(lnors_spacearr); + } + else { + BLI_memarena_free(memarena); + } + + if (need_vnors_recalc) { + BKE_mesh_calc_normals(mesh); + } +#ifdef VALIDATE_MESH + BKE_mesh_validate(mesh, true, true); +#endif } /* settings: 1 - preview, 2 - render */ diff --git a/source/blender/blenkernel/intern/mesh_evaluate.c b/source/blender/blenkernel/intern/mesh_evaluate.c index f9eba118383..003b7b784d5 100644 --- a/source/blender/blenkernel/intern/mesh_evaluate.c +++ b/source/blender/blenkernel/intern/mesh_evaluate.c @@ -1152,7 +1152,6 @@ void BKE_mesh_normals_loop_split( const bool use_split_normals, float split_angle, MLoopNorSpaceArray *r_lnors_spacearr, short (*clnors_data)[2], int *r_loop_to_poly) { - /* For now this is not supported. If we do not use split normals, we do not generate anything fancy! */ BLI_assert(use_split_normals || !(r_lnors_spacearr)); diff --git a/source/blender/blenkernel/intern/object.c b/source/blender/blenkernel/intern/object.c index ff8be5892e9..6e754755cf3 100644 --- a/source/blender/blenkernel/intern/object.c +++ b/source/blender/blenkernel/intern/object.c @@ -2236,66 +2236,6 @@ void BKE_boundbox_minmax(const BoundBox *bb, float obmat[4][4], float r_min[3], } } -/** - * Returns a BBox which each dimensions are at least epsilon. - * \note In case a given dimension needs to be enlarged, its final value will be in [epsilon, 3 * epsilon] range. - * - * \param bb the input bbox to check. - * \param bb_temp the temp bbox to modify (\a bb content is never changed). - * \param epsilon the minimum dimension to ensure. - * \return either bb (if nothing needed to be changed) or bb_temp. - */ -BoundBox *BKE_boundbox_ensure_minimum_dimensions(BoundBox *bb, BoundBox *bb_temp, const float epsilon) -{ - if (fabsf(bb->vec[0][0] - bb->vec[4][0]) < epsilon) { - /* Flat along X axis... 
*/ - *bb_temp = *bb; - bb = bb_temp; - bb->vec[0][0] -= epsilon; - bb->vec[1][0] -= epsilon; - bb->vec[2][0] -= epsilon; - bb->vec[3][0] -= epsilon; - bb->vec[4][0] += epsilon; - bb->vec[5][0] += epsilon; - bb->vec[6][0] += epsilon; - bb->vec[7][0] += epsilon; - } - - if (fabsf(bb->vec[0][1] - bb->vec[3][1]) < epsilon) { - /* Flat along Y axis... */ - if (bb != bb_temp) { - *bb_temp = *bb; - bb = bb_temp; - } - bb->vec[0][1] -= epsilon; - bb->vec[1][1] -= epsilon; - bb->vec[4][1] -= epsilon; - bb->vec[5][1] -= epsilon; - bb->vec[2][1] += epsilon; - bb->vec[3][1] += epsilon; - bb->vec[6][1] += epsilon; - bb->vec[7][1] += epsilon; - } - - if (fabsf(bb->vec[0][2] - bb->vec[1][2]) < epsilon) { - /* Flat along Z axis... */ - if (bb != bb_temp) { - *bb_temp = *bb; - bb = bb_temp; - } - bb->vec[0][2] -= epsilon; - bb->vec[3][2] -= epsilon; - bb->vec[4][2] -= epsilon; - bb->vec[7][2] -= epsilon; - bb->vec[1][2] += epsilon; - bb->vec[2][2] += epsilon; - bb->vec[5][2] += epsilon; - bb->vec[6][2] += epsilon; - } - - return bb; -} - BoundBox *BKE_object_boundbox_get(Object *ob) { BoundBox *bb = NULL; diff --git a/source/blender/blenkernel/intern/scene.c b/source/blender/blenkernel/intern/scene.c index 56bfe5d7ff1..906fa0134a0 100644 --- a/source/blender/blenkernel/intern/scene.c +++ b/source/blender/blenkernel/intern/scene.c @@ -1510,8 +1510,6 @@ static void scene_update_object_func(TaskPool * __restrict pool, void *taskdata, if (add_to_stats) { StatisicsEntry *entry; - BLI_assert(threadid < BLI_pool_get_num_threads(pool)); - entry = MEM_mallocN(sizeof(StatisicsEntry), "update thread statistics"); entry->object = object; entry->start_time = start_time; @@ -1631,10 +1629,11 @@ static bool scene_need_update_objects(Main *bmain) static void scene_update_objects(EvaluationContext *eval_ctx, Main *bmain, Scene *scene, Scene *scene_parent) { - TaskScheduler *task_scheduler = BLI_task_scheduler_get(); + TaskScheduler *task_scheduler; TaskPool *task_pool; ThreadedObjectUpdateState 
state; bool need_singlethread_pass; + bool need_free_scheduler; /* Early check for whether we need to invoke all the task-based * things (spawn new ppol, traverse dependency graph and so on). @@ -1651,6 +1650,15 @@ static void scene_update_objects(EvaluationContext *eval_ctx, Main *bmain, Scene state.scene = scene; state.scene_parent = scene_parent; + if (G.debug & G_DEBUG_DEPSGRAPH_NO_THREADS) { + task_scheduler = BLI_task_scheduler_create(1); + need_free_scheduler = true; + } + else { + task_scheduler = BLI_task_scheduler_get(); + need_free_scheduler = false; + } + /* Those are only needed when blender is run with --debug argument. */ if (G.debug & G_DEBUG_DEPSGRAPH) { const int tot_thread = BLI_task_scheduler_num_threads(task_scheduler); @@ -1665,9 +1673,6 @@ static void scene_update_objects(EvaluationContext *eval_ctx, Main *bmain, Scene #endif task_pool = BLI_task_pool_create(task_scheduler, &state); - if (G.debug & G_DEBUG_DEPSGRAPH_NO_THREADS) { - BLI_pool_set_num_threads(task_pool, 1); - } DAG_threaded_update_begin(scene, scene_update_object_add_task, task_pool); BLI_task_pool_work_and_wait(task_pool); @@ -1700,6 +1705,10 @@ static void scene_update_objects(EvaluationContext *eval_ctx, Main *bmain, Scene if (need_singlethread_pass) { scene_update_all_bases(eval_ctx, scene, scene_parent); } + + if (need_free_scheduler) { + BLI_task_scheduler_free(task_scheduler); + } } static void scene_update_tagged_recursive(EvaluationContext *eval_ctx, Main *bmain, Scene *scene, Scene *scene_parent) diff --git a/source/blender/blenkernel/intern/tracking_stabilize.c b/source/blender/blenkernel/intern/tracking_stabilize.c index 36b24fbb2dc..722fc89a75f 100644 --- a/source/blender/blenkernel/intern/tracking_stabilize.c +++ b/source/blender/blenkernel/intern/tracking_stabilize.c @@ -1167,7 +1167,8 @@ static void stabilization_calculate_data(StabContext *ctx, if (ctx->stab->flag & TRACKING_STABILIZE_SCALE) { *r_scale = expf(scale_step * scaleinf); /* Averaged in log scale */ - 
} else { + } + else { *r_scale = 1.0f; } @@ -1180,8 +1181,8 @@ static void stabilization_calculate_data(StabContext *ctx, */ get_animated_target_pos(ctx, framenr, target_pos); sub_v2_v2(r_translation, target_pos); - *r_angle -= get_animated_target_rot(ctx,framenr); - target_scale = get_animated_target_scale(ctx,framenr); + *r_angle -= get_animated_target_rot(ctx, framenr); + target_scale = get_animated_target_scale(ctx, framenr); if (target_scale != 0.0f) { *r_scale /= target_scale; /* target_scale is an expected/intended reference zoom value */ diff --git a/source/blender/blenlib/BLI_kdopbvh.h b/source/blender/blenlib/BLI_kdopbvh.h index 91d39801645..ba565fca522 100644 --- a/source/blender/blenlib/BLI_kdopbvh.h +++ b/source/blender/blenlib/BLI_kdopbvh.h @@ -95,10 +95,6 @@ typedef void (*BVHTree_NearestPointCallback)(void *userdata, int index, const fl /* callback must update hit in case it finds a nearest successful hit */ typedef void (*BVHTree_RayCastCallback)(void *userdata, int index, const BVHTreeRay *ray, BVHTreeRayHit *hit); -/* callback must update nearest in case it finds a nearest result */ -typedef void (*BVHTree_NearestToRayCallback)(void *userdata, const float ray_co[3], const float ray_dir[3], - const float scale[3], int index, BVHTreeNearest *nearest); - /* callback to check if 2 nodes overlap (use thread if intersection results need to be stored) */ typedef bool (*BVHTree_OverlapCallback)(void *userdata, int index_a, int index_b, int thread); @@ -143,18 +139,6 @@ int BLI_bvhtree_find_nearest( BVHTree *tree, const float co[3], BVHTreeNearest *nearest, BVHTree_NearestPointCallback callback, void *userdata); -int BLI_bvhtree_find_nearest_to_ray_angle( - BVHTree *tree, const float co[3], const float dir[3], - const bool ray_is_normalized, const float scale[3], - BVHTreeNearest *nearest, - BVHTree_NearestToRayCallback callback, void *userdata); - -int BLI_bvhtree_find_nearest_to_ray( - BVHTree *tree, const float co[3], const float dir[3], - const bool 
ray_is_normalized, const float scale[3], - BVHTreeNearest *nearest, - BVHTree_NearestToRayCallback callback, void *userdata); - int BLI_bvhtree_ray_cast_ex( BVHTree *tree, const float co[3], const float dir[3], float radius, BVHTreeRayHit *hit, BVHTree_RayCastCallback callback, void *userdata, diff --git a/source/blender/blenlib/BLI_math_geom.h b/source/blender/blenlib/BLI_math_geom.h index 4a85e859c16..f1d9c9571f2 100644 --- a/source/blender/blenlib/BLI_math_geom.h +++ b/source/blender/blenlib/BLI_math_geom.h @@ -298,23 +298,6 @@ bool isect_ray_aabb_v3_simple( const float bb_min[3], const float bb_max[3], float *tmin, float *tmax); -struct NearestRayToAABB_Precalc { - float ray_origin[3]; - float ray_direction[3]; - float ray_inv_dir[3]; - float cdot_axis[3]; - float idiag_sq[3]; - bool sign[3]; -}; - -void dist_squared_ray_to_aabb_v3_precalc( - struct NearestRayToAABB_Precalc *data, - const float ray_origin[3], const float ray_direction[3]); -float dist_squared_ray_to_aabb_v3( - const struct NearestRayToAABB_Precalc *data, - const float bb_min[3], const float bb_max[3], - bool r_axis_closest[3]); - /* other */ bool isect_sweeping_sphere_tri_v3(const float p1[3], const float p2[3], const float radius, const float v0[3], const float v1[3], const float v2[3], float *r_lambda, float ipoint[3]); diff --git a/source/blender/blenlib/BLI_rect.h b/source/blender/blenlib/BLI_rect.h index 59bf3644912..041679ef876 100644 --- a/source/blender/blenlib/BLI_rect.h +++ b/source/blender/blenlib/BLI_rect.h @@ -47,6 +47,8 @@ bool BLI_rcti_is_empty(const struct rcti *rect); bool BLI_rctf_is_empty(const struct rctf *rect); void BLI_rctf_init(struct rctf *rect, float xmin, float xmax, float ymin, float ymax); void BLI_rcti_init(struct rcti *rect, int xmin, int xmax, int ymin, int ymax); +void BLI_rctf_init_pt_radius(struct rctf *rect, const float xy[2], float size); +void BLI_rcti_init_pt_radius(struct rcti *rect, const int xy[2], int size); void BLI_rcti_init_minmax(struct rcti 
*rect); void BLI_rctf_init_minmax(struct rctf *rect); void BLI_rcti_do_minmax_v(struct rcti *rect, const int xy[2]); diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h index 967e0be6d0a..c3c587275e1 100644 --- a/source/blender/blenlib/BLI_task.h +++ b/source/blender/blenlib/BLI_task.h @@ -81,6 +81,7 @@ typedef void (*TaskFreeFunction)(TaskPool *__restrict pool, void *taskdata, int TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata); TaskPool *BLI_task_pool_create_background(TaskScheduler *scheduler, void *userdata); +TaskPool *BLI_task_pool_create_suspended(TaskScheduler *scheduler, void *userdata); void BLI_task_pool_free(TaskPool *pool); void BLI_task_pool_push_ex( @@ -95,14 +96,6 @@ void BLI_task_pool_push_from_thread(TaskPool *pool, TaskRunFunction run, void BLI_task_pool_work_and_wait(TaskPool *pool); /* cancel all tasks, keep worker threads running */ void BLI_task_pool_cancel(TaskPool *pool); -/* stop all worker threads */ -void BLI_task_pool_stop(TaskPool *pool); - -/* get number of threads allowed to be used by this pool */ -int BLI_pool_get_num_threads(TaskPool *pool); - -/* set number of threads allowed to be used by this pool */ -void BLI_pool_set_num_threads(TaskPool *pool, int num_threads); /* for worker threads, test if canceled */ bool BLI_task_pool_canceled(TaskPool *pool); @@ -113,9 +106,6 @@ void *BLI_task_pool_userdata(TaskPool *pool); /* optional mutex to use from run function */ ThreadMutex *BLI_task_pool_user_mutex(TaskPool *pool); -/* number of tasks done, for stats, don't use this to make decisions */ -size_t BLI_task_pool_tasks_done(TaskPool *pool); - /* Parallel for routines */ typedef void (*TaskParallelRangeFunc)(void *userdata, const int iter); typedef void (*TaskParallelRangeFuncEx)(void *userdata, void *userdata_chunk, const int iter, const int thread_id); diff --git a/source/blender/blenlib/intern/BLI_kdopbvh.c b/source/blender/blenlib/intern/BLI_kdopbvh.c index 
b14007a88cb..19d9711922e 100644 --- a/source/blender/blenlib/intern/BLI_kdopbvh.c +++ b/source/blender/blenlib/intern/BLI_kdopbvh.c @@ -159,29 +159,6 @@ typedef struct BVHRayCastData { BVHTreeRayHit hit; } BVHRayCastData; -typedef struct BVHNearestRayData { - BVHTree *tree; - BVHTree_NearestToRayCallback callback; - void *userdata; - - struct { - bool sign[3]; - float origin[3]; - float direction[3]; - - float direction_scaled_square[3]; - float inv_dir[3]; - - float cdot_axis[3]; - } ray; - - bool pick_smallest[3]; - - BVHTreeNearest nearest; - - float scale[3]; -} BVHNearestRayData; - /** \} */ @@ -1900,453 +1877,6 @@ void BLI_bvhtree_ray_cast_all( /* -------------------------------------------------------------------- */ -/** \name BLI_bvhtree_find_nearest_to_ray functions - * - * \{ */ - -static void dist_squared_ray_to_aabb_scaled_v3_precalc( - BVHNearestRayData *data, - const float ray_origin[3], const float ray_direction[3], - const bool ray_is_normalized, const float scale[3]) -{ - if (scale) { - copy_v3_v3(data->scale, scale); - } - else { - copy_v3_fl(data->scale, 1.0f); - } - /* un-normalize ray */ - if (ray_is_normalized && scale && - (data->scale[0] != 1.0f || data->scale[1] != 1.0f || data->scale[2] != 1.0f)) - { - data->ray.direction[0] = ray_direction[0] * data->scale[0]; - data->ray.direction[1] = ray_direction[1] * data->scale[1]; - data->ray.direction[2] = ray_direction[2] * data->scale[2]; - - mul_v3_v3fl(data->ray.direction, ray_direction, 1 / len_v3(data->ray.direction)); - } - else { - copy_v3_v3(data->ray.direction, ray_direction); - } - - float dir_sq[3]; - - for (int i = 0; i < 3; i++) { - data->ray.origin[i] = ray_origin[i]; - data->ray.inv_dir[i] = (data->ray.direction[i] != 0.0f) ? 
- (1.0f / data->ray.direction[i]) : FLT_MAX; - /* It has to be in function of `ray.inv_dir`, - * since the division of 1 by 0.0f, can be -inf or +inf */ - data->ray.sign[i] = (data->ray.inv_dir[i] < 0.0f); - - data->ray.direction_scaled_square[i] = data->ray.direction[i] * data->scale[i]; - - dir_sq[i] = SQUARE(data->ray.direction_scaled_square[i]); - - data->ray.direction_scaled_square[i] *= data->scale[i]; - } - - /* `diag_sq` Length square of each face diagonal */ - float diag_sq[3] = { - dir_sq[1] + dir_sq[2], - dir_sq[0] + dir_sq[2], - dir_sq[0] + dir_sq[1], - }; - - data->ray.cdot_axis[0] = (diag_sq[0] != 0.0f) ? data->ray.direction[0] / diag_sq[0] : FLT_MAX; - data->ray.cdot_axis[1] = (diag_sq[1] != 0.0f) ? data->ray.direction[1] / diag_sq[1] : FLT_MAX; - data->ray.cdot_axis[2] = (diag_sq[2] != 0.0f) ? data->ray.direction[2] / diag_sq[2] : FLT_MAX; -} - -/** - * Returns the squared distance from a ray to a bound-box `AABB`. - * It is based on `fast_ray_nearest_hit` solution to obtain - * the coordinates of the nearest edge of Bound Box to the ray - */ -MINLINE float dist_squared_ray_to_aabb_scaled_v3__impl( - const BVHNearestRayData *data, - const float bv[6], float *r_depth_sq, bool r_axis_closest[3]) -{ - - /* `tmin` is a vector that has the smaller distances to each of the - * infinite planes of the `AABB` faces (hit in nearest face X plane, - * nearest face Y plane and nearest face Z plane) */ - float local_bvmin[3], local_bvmax[3]; - - if (data->ray.sign[0]) { - local_bvmin[0] = bv[1]; - local_bvmax[0] = bv[0]; - } - else { - local_bvmin[0] = bv[0]; - local_bvmax[0] = bv[1]; - } - - if (data->ray.sign[1]) { - local_bvmin[1] = bv[3]; - local_bvmax[1] = bv[2]; - } - else { - local_bvmin[1] = bv[2]; - local_bvmax[1] = bv[3]; - } - - if (data->ray.sign[2]) { - local_bvmin[2] = bv[5]; - local_bvmax[2] = bv[4]; - } - else { - local_bvmin[2] = bv[4]; - local_bvmax[2] = bv[5]; - } - - sub_v3_v3(local_bvmin, data->ray.origin); - sub_v3_v3(local_bvmax, 
data->ray.origin); - - const float tmin[3] = { - local_bvmin[0] * data->ray.inv_dir[0], - local_bvmin[1] * data->ray.inv_dir[1], - local_bvmin[2] * data->ray.inv_dir[2], - }; - - /* `tmax` is a vector that has the longer distances to each of the - * infinite planes of the `AABB` faces (hit in farthest face X plane, - * farthest face Y plane and farthest face Z plane) */ - const float tmax[3] = { - local_bvmax[0] * data->ray.inv_dir[0], - local_bvmax[1] * data->ray.inv_dir[1], - local_bvmax[2] * data->ray.inv_dir[2], - }; - /* `v1` and `v3` is be the coordinates of the nearest `AABB` edge to the ray*/ - float v1[3], v2[3]; - /* `rtmin` is the highest value of the smaller distances. == max_axis_v3(tmin) - * `rtmax` is the lowest value of longer distances. == min_axis_v3(tmax)*/ - float rtmin, rtmax, mul; - /* `main_axis` is the axis equivalent to edge close to the ray */ - int main_axis; - - r_axis_closest[0] = false; - r_axis_closest[1] = false; - r_axis_closest[2] = false; - - /* *** min_axis_v3(tmax) *** */ - if ((tmax[0] <= tmax[1]) && (tmax[0] <= tmax[2])) { - // printf("# Hit in X %s\n", data->sign[0] ? "min", "max"); - rtmax = tmax[0]; - v1[0] = v2[0] = local_bvmax[0]; - mul = local_bvmax[0] * data->ray.direction_scaled_square[0]; - main_axis = 3; - r_axis_closest[0] = data->ray.sign[0]; - } - else if ((tmax[1] <= tmax[0]) && (tmax[1] <= tmax[2])) { - // printf("# Hit in Y %s\n", data->sign[1] ? "min", "max"); - rtmax = tmax[1]; - v1[1] = v2[1] = local_bvmax[1]; - mul = local_bvmax[1] * data->ray.direction_scaled_square[1]; - main_axis = 2; - r_axis_closest[1] = data->ray.sign[1]; - } - else { - // printf("# Hit in Z %s\n", data->sign[2] ? "min", "max"); - rtmax = tmax[2]; - v1[2] = v2[2] = local_bvmax[2]; - mul = local_bvmax[2] * data->ray.direction_scaled_square[2]; - main_axis = 1; - r_axis_closest[2] = data->ray.sign[2]; - } - - /* *** max_axis_v3(tmin) *** */ - if ((tmin[0] >= tmin[1]) && (tmin[0] >= tmin[2])) { - // printf("# To X %s\n", data->sign[0] ? 
"max", "min"); - rtmin = tmin[0]; - v1[0] = v2[0] = local_bvmin[0]; - mul += local_bvmin[0] * data->ray.direction_scaled_square[0]; - main_axis -= 3; - r_axis_closest[0] = !data->ray.sign[0]; - } - else if ((tmin[1] >= tmin[0]) && (tmin[1] >= tmin[2])) { - // printf("# To Y %s\n", data->sign[1] ? "max", "min"); - rtmin = tmin[1]; - v1[1] = v2[1] = local_bvmin[1]; - mul += local_bvmin[1] * data->ray.direction_scaled_square[1]; - main_axis -= 1; - r_axis_closest[1] = !data->ray.sign[1]; - } - else { - // printf("# To Z %s\n", data->sign[2] ? "max", "min"); - rtmin = tmin[2]; - v1[2] = v2[2] = local_bvmin[2]; - mul += local_bvmin[2] * data->ray.direction_scaled_square[2]; - main_axis -= 2; - r_axis_closest[2] = !data->ray.sign[2]; - } - /* *** end min/max axis *** */ - - if (main_axis < 0) - main_axis += 3; - - /* if rtmin < rtmax, ray intersect `AABB` */ - if (rtmin <= rtmax) { -#ifdef IGNORE_BEHIND_RAY - /* `if rtmax < depth_min`, the whole `AABB` is behind us */ - if (rtmax < min_depth) { - return fallback; - } -#endif - const float proj = rtmin * data->ray.direction[main_axis]; - - if (data->ray.sign[main_axis]) - r_axis_closest[main_axis] = (proj - local_bvmax[main_axis]) < (local_bvmin[main_axis] - proj); - else - r_axis_closest[main_axis] = (proj - local_bvmin[main_axis]) < (local_bvmax[main_axis] - proj); - - //if (r_depth_sq) - // *r_depth_sq = SQUARE(rtmin); - - return 0.0f; - } -#ifdef IGNORE_BEHIND_RAY - /* `if rtmin < depth_min`, the whole `AABB` is behing us */ - else if (rtmin < min_depth) { - return fallback; - } -#endif - - if (data->ray.sign[main_axis]) { - v1[main_axis] = local_bvmax[main_axis]; - v2[main_axis] = local_bvmin[main_axis]; - } - else { - v1[main_axis] = local_bvmin[main_axis]; - v2[main_axis] = local_bvmax[main_axis]; - } - { - /* `proj` equals to nearest point on the ray closest to the edge `v1 v2` of the `AABB`. 
*/ - const float proj = mul * data->ray.cdot_axis[main_axis]; - float depth_sq, r_point[3]; - if (v1[main_axis] > proj) { /* the nearest point to the ray is the point v1 */ - r_axis_closest[main_axis] = true; - /* `depth` is equivalent the distance of the the projection of v1 on the ray */ - depth_sq = mul + data->ray.direction_scaled_square[main_axis] * v1[main_axis]; - - copy_v3_v3(r_point, v1); - } - else if (v2[main_axis] < proj) { /* the nearest point of the ray is the point v2 */ - r_axis_closest[main_axis] = false; - - depth_sq = mul + data->ray.direction_scaled_square[main_axis] * v2[main_axis]; - - copy_v3_v3(r_point, v2); - } - else { /* the nearest point of the ray is on the edge of the `AABB`. */ - r_axis_closest[main_axis] = (proj - v1[main_axis]) < (v2[main_axis] - proj); - - depth_sq = mul + data->ray.direction_scaled_square[main_axis] * proj; -#if 0 - r_point[0] = main_axis == 0 ? proj : v2[0]; - r_point[1] = main_axis == 1 ? proj : v2[1]; - r_point[2] = main_axis == 2 ? proj : v2[2]; -#else - v2[main_axis] = proj; - copy_v3_v3(r_point, v2); -#endif - } - depth_sq *= depth_sq; - - if (r_depth_sq) - *r_depth_sq = depth_sq; - - /* TODO: scale can be optional */ - r_point[0] *= data->scale[0]; - r_point[1] *= data->scale[1]; - r_point[2] *= data->scale[2]; - - return len_squared_v3(r_point) - depth_sq; - } -} - -/** - * <pre> - * + r_point - * | - * | dist - * | - * +----depth----+orig <-- dir - * - * tangent = dist/depth - * </pre> - */ -static float calc_tangent_sq(BVHNearestRayData *data, BVHNode *node) -{ - float depth_sq; - const float dist_sq = dist_squared_ray_to_aabb_scaled_v3__impl( - data, node->bv, &depth_sq, data->pick_smallest); - - return (dist_sq != 0.0f) ? 
(dist_sq / depth_sq) : 0.0f; -} - -static float calc_dist_sq_to_ray(BVHNearestRayData *data, BVHNode *node) -{ - return dist_squared_ray_to_aabb_scaled_v3__impl( - data, node->bv, NULL, - data->pick_smallest); -} - -static void dfs_find_lowest_tangent_dfs(BVHNearestRayData *data, BVHNode *node) -{ - if (node->totnode == 0) { - if (data->callback) { - data->callback(data->userdata, data->ray.origin, data->ray.direction, - data->scale, node->index, &data->nearest); - } - else { - data->nearest.index = node->index; - data->nearest.dist_sq = calc_tangent_sq(data, node); - /* TODO: return a value to the data->nearest.co - * not urgent however since users currently define own callbacks */ - } - } - else { - int i; - /* First pick the closest node to dive on */ - if (data->pick_smallest[node->main_axis]) { - for (i = 0; i != node->totnode; i++) { - if (calc_tangent_sq(data, node->children[i]) < data->nearest.dist_sq) { - dfs_find_lowest_tangent_dfs(data, node->children[i]); - } - } - } - else { - for (i = node->totnode - 1; i >= 0; i--) { - if (calc_tangent_sq(data, node->children[i]) < data->nearest.dist_sq) { - dfs_find_lowest_tangent_dfs(data, node->children[i]); - } - } - } - } -} - -static void dfs_find_nearest_to_ray_dfs(BVHNearestRayData *data, BVHNode *node) -{ - if (node->totnode == 0) { - if (data->callback) { - data->callback(data->userdata, data->ray.origin, data->ray.direction, - data->scale, node->index, &data->nearest); - } - else { - data->nearest.index = node->index; - data->nearest.dist_sq = calc_dist_sq_to_ray(data, node); - /* TODO: return a value to the data->nearest.co - * not urgent however since users currently define own callbacks */ - } - } - else { - int i; - /* First pick the closest node to dive on */ - if (data->pick_smallest[node->main_axis]) { - for (i = 0; i != node->totnode; i++) { - if (calc_dist_sq_to_ray(data, node->children[i]) < data->nearest.dist_sq) { - dfs_find_nearest_to_ray_dfs(data, node->children[i]); - } - } - } - else { - 
for (i = node->totnode - 1; i >= 0; i--) { - if (calc_dist_sq_to_ray(data, node->children[i]) < data->nearest.dist_sq) { - dfs_find_nearest_to_ray_dfs(data, node->children[i]); - } - } - } - } -} - -/** - * Returns the point whose tangent defined by the angle between the point and ray is the lowest - * nearest.dist_sq returns the angle's tangent - */ -int BLI_bvhtree_find_nearest_to_ray_angle( - BVHTree *tree, const float co[3], const float dir[3], - const bool ray_is_normalized, const float scale[3], - BVHTreeNearest *nearest, - BVHTree_NearestToRayCallback callback, void *userdata) -{ - BVHNearestRayData data; - BVHNode *root = tree->nodes[tree->totleaf]; - - data.tree = tree; - - data.callback = callback; - data.userdata = userdata; - - dist_squared_ray_to_aabb_scaled_v3_precalc(&data, co, dir, ray_is_normalized, scale); - - if (nearest) { - memcpy(&data.nearest, nearest, sizeof(*nearest)); - } - else { - data.nearest.index = -1; - data.nearest.dist_sq = FLT_MAX; - } - - /* dfs search */ - if (root) { - if (calc_tangent_sq(&data, root) < data.nearest.dist_sq) - dfs_find_lowest_tangent_dfs(&data, root); - } - - /* copy back results */ - if (nearest) { - memcpy(nearest, &data.nearest, sizeof(*nearest)); - } - - return data.nearest.index; -} - -/* return the nearest point to ray */ -int BLI_bvhtree_find_nearest_to_ray( - BVHTree *tree, const float co[3], const float dir[3], - const bool ray_is_normalized, const float scale[3], - BVHTreeNearest *nearest, - BVHTree_NearestToRayCallback callback, void *userdata) -{ - BVHNearestRayData data; - BVHNode *root = tree->nodes[tree->totleaf]; - - data.tree = tree; - - data.callback = callback; - data.userdata = userdata; - - dist_squared_ray_to_aabb_scaled_v3_precalc(&data, co, dir, ray_is_normalized, scale); - - if (nearest) { - memcpy(&data.nearest, nearest, sizeof(*nearest)); - } - else { - data.nearest.index = -1; - data.nearest.dist_sq = FLT_MAX; - } - - /* dfs search */ - if (root) { - if (calc_dist_sq_to_ray(&data, 
root) < data.nearest.dist_sq) { - dfs_find_nearest_to_ray_dfs(&data, root); - } - } - - /* copy back results */ - if (nearest) { - memcpy(nearest, &data.nearest, sizeof(*nearest)); - } - - return data.nearest.index; -} - -/** \} */ - - -/* -------------------------------------------------------------------- */ - /** \name BLI_bvhtree_range_query * * Allocs and fills an array with the indexs of node that are on the given spherical range (center, radius). diff --git a/source/blender/blenlib/intern/array_store.c b/source/blender/blenlib/intern/array_store.c index 21ddddad32e..295b39c1a2f 100644 --- a/source/blender/blenlib/intern/array_store.c +++ b/source/blender/blenlib/intern/array_store.c @@ -217,15 +217,12 @@ /** \name Internal Structs * \{ */ -typedef unsigned int uint; -typedef unsigned char ubyte; - typedef uint64_t hash_key; typedef struct BArrayInfo { size_t chunk_stride; - uint chunk_count; + // uint chunk_count; /* UNUSED (other values are derived from this) */ /* pre-calculated */ size_t chunk_byte_size; @@ -291,7 +288,7 @@ typedef struct BChunkList { /* a chunk of an array */ typedef struct BChunk { - const ubyte *data; + const uchar *data; size_t data_len; /** number of #BChunkList using this. 
*/ int users; @@ -332,7 +329,7 @@ static size_t bchunk_list_size(const BChunkList *chunk_list); * \{ */ static BChunk *bchunk_new( - BArrayMemory *bs_mem, const ubyte *data, const size_t data_len) + BArrayMemory *bs_mem, const uchar *data, const size_t data_len) { BChunk *chunk = BLI_mempool_alloc(bs_mem->chunk); chunk->data = data; @@ -345,9 +342,9 @@ static BChunk *bchunk_new( } static BChunk *bchunk_new_copydata( - BArrayMemory *bs_mem, const ubyte *data, const size_t data_len) + BArrayMemory *bs_mem, const uchar *data, const size_t data_len) { - ubyte *data_copy = MEM_mallocN(data_len, __func__); + uchar *data_copy = MEM_mallocN(data_len, __func__); memcpy(data_copy, data, data_len); return bchunk_new(bs_mem, data_copy, data_len); } @@ -367,7 +364,7 @@ static void bchunk_decref( static bool bchunk_data_compare( const BChunk *chunk, - const ubyte *data_base, const size_t data_base_len, + const uchar *data_base, const size_t data_base_len, const size_t offset) { if (offset + (size_t)chunk->data_len <= data_base_len) { @@ -426,14 +423,14 @@ static void bchunk_list_decref( #ifdef USE_VALIDATE_LIST_DATA_PARTIAL static size_t bchunk_list_data_check( - const BChunkList *chunk_list, const ubyte *data) + const BChunkList *chunk_list, const uchar *data) { - size_t total_size = 0; + size_t offset = 0; for (BChunkRef *cref = chunk_list->chunk_refs.first; cref; cref = cref->next) { - if (memcmp(&data[total_size], cref->link->data, cref->link->data_len) != 0) { + if (memcmp(&data[offset], cref->link->data, cref->link->data_len) != 0) { return false; } - total_size += cref->link->data_len; + offset += cref->link->data_len; } return true; } @@ -466,7 +463,7 @@ static void bchunk_list_ensure_min_size_last( chunk_list->chunk_refs.last = cref->prev; chunk_list->chunk_refs_len -= 1; - ubyte *data_merge = MEM_mallocN(data_merge_len, __func__); + uchar *data_merge = MEM_mallocN(data_merge_len, __func__); memcpy(data_merge, chunk_prev->data, chunk_prev->data_len); 
memcpy(&data_merge[chunk_prev->data_len], chunk_curr->data, chunk_curr->data_len); @@ -487,8 +484,8 @@ static void bchunk_list_ensure_min_size_last( /* merge and split */ const size_t data_prev_len = split; const size_t data_curr_len = data_merge_len - split; - ubyte *data_prev = MEM_mallocN(data_prev_len, __func__); - ubyte *data_curr = MEM_mallocN(data_curr_len, __func__); + uchar *data_prev = MEM_mallocN(data_prev_len, __func__); + uchar *data_curr = MEM_mallocN(data_curr_len, __func__); if (data_prev_len <= chunk_prev->data_len) { const size_t data_curr_shrink_len = chunk_prev->data_len - data_prev_len; @@ -597,11 +594,10 @@ static void bchunk_list_append_only( static void bchunk_list_append_data( const BArrayInfo *info, BArrayMemory *bs_mem, BChunkList *chunk_list, - const ubyte *data, const size_t data_len) + const uchar *data, const size_t data_len) { BLI_assert(data_len != 0); - // printf("data_len: %d\n", data_len); #ifdef USE_MERGE_CHUNKS BLI_assert(data_len <= info->chunk_byte_size_max); @@ -613,13 +609,13 @@ static void bchunk_list_append_data( const size_t data_merge_len = chunk_prev->data_len + data_len; /* realloc for single user */ if (cref->link->users == 1) { - ubyte *data_merge = MEM_reallocN((void *)cref->link->data, data_merge_len); + uchar *data_merge = MEM_reallocN((void *)cref->link->data, data_merge_len); memcpy(&data_merge[chunk_prev->data_len], data, data_len); cref->link->data = data_merge; cref->link->data_len = data_merge_len; } else { - ubyte *data_merge = MEM_mallocN(data_merge_len, __func__); + uchar *data_merge = MEM_mallocN(data_merge_len, __func__); memcpy(data_merge, chunk_prev->data, chunk_prev->data_len); memcpy(&data_merge[chunk_prev->data_len], data, data_len); cref->link = bchunk_new(bs_mem, data_merge, data_merge_len); @@ -639,7 +635,7 @@ static void bchunk_list_append_data( /* don't run this, instead preemptively avoid creating a chunk only to merge it (above). 
*/ #if 0 #ifdef USE_MERGE_CHUNKS - bchunk_list_ensure_min_size_last(info, bs_mem, chunk_list, chunk_size_min); + bchunk_list_ensure_min_size_last(info, bs_mem, chunk_list); #endif #endif } @@ -654,7 +650,7 @@ static void bchunk_list_append_data( static void bchunk_list_append_data_n( const BArrayInfo *info, BArrayMemory *bs_mem, BChunkList *chunk_list, - const ubyte *data, size_t data_len) + const uchar *data, size_t data_len) { size_t data_trim_len, data_last_chunk_len; bchunk_list_calc_trim_len(info, data_len, &data_trim_len, &data_last_chunk_len); @@ -714,7 +710,7 @@ static void bchunk_list_append( static void bchunk_list_fill_from_array( const BArrayInfo *info, BArrayMemory *bs_mem, BChunkList *chunk_list, - const ubyte *data, + const uchar *data, const size_t data_len) { BLI_assert(BLI_listbase_is_empty(&chunk_list->chunk_refs)); @@ -765,13 +761,13 @@ static void bchunk_list_fill_from_array( #define HASH_INIT (5381) -BLI_INLINE uint hash_data_single(const ubyte p) +BLI_INLINE uint hash_data_single(const uchar p) { return (HASH_INIT << 5) + HASH_INIT + (unsigned int)p; } /* hash bytes, from BLI_ghashutil_strhash_n */ -static uint hash_data(const ubyte *key, size_t n) +static uint hash_data(const uchar *key, size_t n) { const signed char *p; unsigned int h = HASH_INIT; @@ -788,7 +784,7 @@ static uint hash_data(const ubyte *key, size_t n) #ifdef USE_HASH_TABLE_ACCUMULATE static void hash_array_from_data( - const BArrayInfo *info, const ubyte *data_slice, const size_t data_slice_len, + const BArrayInfo *info, const uchar *data_slice, const size_t data_slice_len, hash_key *hash_array) { if (info->chunk_stride != 1) { @@ -877,7 +873,7 @@ static void hash_accum_single(hash_key *hash_array, const size_t hash_array_len, static hash_key key_from_chunk_ref( const BArrayInfo *info, const BChunkRef *cref, - /* avoid reallicating each time */ + /* avoid reallocating each time */ hash_key *hash_store, const size_t hash_store_len) { /* in C, will fill in a reusable array */ 
@@ -899,7 +895,7 @@ static hash_key key_from_chunk_ref( key = hash_store[0]; /* cache the key */ - if (key == HASH_TABLE_KEY_UNSET) { + if (UNLIKELY(key == HASH_TABLE_KEY_UNSET)) { key = HASH_TABLE_KEY_FALLBACK; } chunk->key = key; @@ -929,12 +925,12 @@ static hash_key key_from_chunk_ref( static const BChunkRef *table_lookup( const BArrayInfo *info, BTableRef **table, const size_t table_len, const size_t i_table_start, - const ubyte *data, const size_t data_len, const size_t offset, const hash_key *table_hash_array) + const uchar *data, const size_t data_len, const size_t offset, const hash_key *table_hash_array) { size_t size_left = data_len - offset; hash_key key = table_hash_array[((offset - i_table_start) / info->chunk_stride)]; size_t key_index = (size_t)(key % (hash_key)table_len); - for (BTableRef *tref = table[key_index]; tref; tref = tref->next) { + for (const BTableRef *tref = table[key_index]; tref; tref = tref->next) { const BChunkRef *cref = tref->cref; #ifdef USE_HASH_TABLE_KEY_CACHE if (cref->link->key == key) @@ -985,7 +981,7 @@ static hash_key key_from_chunk_ref(const BArrayInfo *info, const BChunkRef *cref static const BChunkRef *table_lookup( const BArrayInfo *info, BTableRef **table, const size_t table_len, const uint UNUSED(i_table_start), - const ubyte *data, const size_t data_len, const size_t offset, const hash_key *UNUSED(table_hash_array)) + const uchar *data, const size_t data_len, const size_t offset, const hash_key *UNUSED(table_hash_array)) { const size_t data_hash_len = BCHUNK_HASH_LEN * info->chunk_stride; /* TODO, cache */ @@ -1025,7 +1021,7 @@ static const BChunkRef *table_lookup( */ static BChunkList *bchunk_list_from_data_merge( const BArrayInfo *info, BArrayMemory *bs_mem, - const ubyte *data, const size_t data_len_original, + const uchar *data, const size_t data_len_original, const BChunkList *chunk_list_reference) { ASSERT_CHUNKLIST_SIZE(chunk_list_reference, chunk_list_reference->total_size); @@ -1042,10 +1038,8 @@ static 
BChunkList *bchunk_list_from_data_merge( size_t i_prev = 0; #ifdef USE_FASTPATH_CHUNKS_FIRST - bool full_match = false; - { - full_match = true; + bool full_match = true; const BChunkRef *cref = chunk_list_reference->chunk_refs.first; while (i_prev < data_len_original) { @@ -1433,7 +1427,7 @@ BArrayStore *BLI_array_store_create( BArrayStore *bs = MEM_callocN(sizeof(BArrayStore), __func__); bs->info.chunk_stride = stride; - bs->info.chunk_count = chunk_count; + // bs->info.chunk_count = chunk_count; bs->info.chunk_byte_size = chunk_count * stride; #ifdef USE_MERGE_CHUNKS @@ -1579,7 +1573,7 @@ BArrayState *BLI_array_store_state_add( if (state_reference) { chunk_list = bchunk_list_from_data_merge( &bs->info, &bs->memory, - (const ubyte *)data, data_len, + (const uchar *)data, data_len, /* re-use reference chunks */ state_reference->chunk_list); } @@ -1588,7 +1582,7 @@ BArrayState *BLI_array_store_state_add( bchunk_list_fill_from_array( &bs->info, &bs->memory, chunk_list, - (const ubyte *)data, data_len); + (const uchar *)data, data_len); } chunk_list->users += 1; @@ -1655,7 +1649,7 @@ void BLI_array_store_state_data_get( BLI_assert(data_test_len == state->chunk_list->total_size); #endif - ubyte *data_step = (ubyte *)data; + uchar *data_step = (uchar *)data; for (BChunkRef *cref = state->chunk_list->chunk_refs.first; cref; cref = cref->next) { BLI_assert(cref->link->users > 0); memcpy(data_step, cref->link->data, cref->link->data_len); diff --git a/source/blender/blenlib/intern/math_geom.c b/source/blender/blenlib/intern/math_geom.c index aeb6a550cd9..3cf26ccf904 100644 --- a/source/blender/blenlib/intern/math_geom.c +++ b/source/blender/blenlib/intern/math_geom.c @@ -2337,224 +2337,6 @@ bool isect_ray_aabb_v3_simple( } } -void dist_squared_ray_to_aabb_v3_precalc( - struct NearestRayToAABB_Precalc *data, - const float ray_origin[3], const float ray_direction[3]) -{ - float dir_sq[3]; - - for (int i = 0; i < 3; i++) { - data->ray_origin[i] = ray_origin[i]; - 
data->ray_direction[i] = ray_direction[i]; - data->ray_inv_dir[i] = (data->ray_direction[i] != 0.0f) ? (1.0f / data->ray_direction[i]) : FLT_MAX; - /* It has to be a function of `ray_inv_dir`, - * since the division of 1 by 0.0f, can be -inf or +inf */ - data->sign[i] = (data->ray_inv_dir[i] < 0.0f); - - dir_sq[i] = SQUARE(data->ray_direction[i]); - } - - /* `diag_sq` Length square of each face diagonal */ - float diag_sq[3] = { - dir_sq[1] + dir_sq[2], - dir_sq[0] + dir_sq[2], - dir_sq[0] + dir_sq[1], - }; - data->idiag_sq[0] = (diag_sq[0] > FLT_EPSILON) ? (1.0f / diag_sq[0]) : FLT_MAX; - data->idiag_sq[1] = (diag_sq[1] > FLT_EPSILON) ? (1.0f / diag_sq[1]) : FLT_MAX; - data->idiag_sq[2] = (diag_sq[2] > FLT_EPSILON) ? (1.0f / diag_sq[2]) : FLT_MAX; - - data->cdot_axis[0] = data->ray_direction[0] * data->idiag_sq[0]; - data->cdot_axis[1] = data->ray_direction[1] * data->idiag_sq[1]; - data->cdot_axis[2] = data->ray_direction[2] * data->idiag_sq[2]; -} - -/** - * Returns the squared distance from a ray to a bound-box `AABB`. 
- * It is based on `fast_ray_nearest_hit` solution to obtain - * the coordinates of the nearest edge of Bound Box to the ray - */ -float dist_squared_ray_to_aabb_v3( - const struct NearestRayToAABB_Precalc *data, - const float bb_min[3], const float bb_max[3], - bool r_axis_closest[3]) -{ - /* `tmin` is a vector that has the smaller distances to each of the - * infinite planes of the `AABB` faces (hit in nearest face X plane, - * nearest face Y plane and nearest face Z plane) */ - float local_bvmin[3], local_bvmax[3]; - - if (data->sign[0] == 0) { - local_bvmin[0] = bb_min[0] - data->ray_origin[0]; - local_bvmax[0] = bb_max[0] - data->ray_origin[0]; - } - else { - local_bvmin[0] = bb_max[0] - data->ray_origin[0]; - local_bvmax[0] = bb_min[0] - data->ray_origin[0]; - } - - if (data->sign[1] == 0) { - local_bvmin[1] = bb_min[1] - data->ray_origin[1]; - local_bvmax[1] = bb_max[1] - data->ray_origin[1]; - } - else { - local_bvmin[1] = bb_max[1] - data->ray_origin[1]; - local_bvmax[1] = bb_min[1] - data->ray_origin[1]; - } - - if (data->sign[2] == 0) { - local_bvmin[2] = bb_min[2] - data->ray_origin[2]; - local_bvmax[2] = bb_max[2] - data->ray_origin[2]; - } - else { - local_bvmin[2] = bb_max[2] - data->ray_origin[2]; - local_bvmax[2] = bb_min[2] - data->ray_origin[2]; - } - - const float tmin[3] = { - local_bvmin[0] * data->ray_inv_dir[0], - local_bvmin[1] * data->ray_inv_dir[1], - local_bvmin[2] * data->ray_inv_dir[2], - }; - - /* `tmax` is a vector that has the longer distances to each of the - * infinite planes of the `AABB` faces (hit in farthest face X plane, - * farthest face Y plane and farthest face Z plane) */ - const float tmax[3] = { - local_bvmax[0] * data->ray_inv_dir[0], - local_bvmax[1] * data->ray_inv_dir[1], - local_bvmax[2] * data->ray_inv_dir[2], - }; - /* `v1` and `v3` is be the coordinates of the nearest `AABB` edge to the ray*/ - float v1[3], v2[3]; - /* `rtmin` is the highest value of the smaller distances. 
== max_axis_v3(tmin) - * `rtmax` is the lowest value of longer distances. == min_axis_v3(tmax)*/ - float rtmin, rtmax, mul, rdist; - /* `main_axis` is the axis equivalent to edge close to the ray */ - int main_axis; - - r_axis_closest[0] = false; - r_axis_closest[1] = false; - r_axis_closest[2] = false; - - /* *** min_axis_v3(tmax) *** */ - if ((tmax[0] <= tmax[1]) && (tmax[0] <= tmax[2])) { - // printf("# Hit in X %s\n", data->sign[0] ? "min", "max"); - rtmax = tmax[0]; - v1[0] = v2[0] = local_bvmax[0]; - mul = local_bvmax[0] * data->ray_direction[0]; - main_axis = 3; - r_axis_closest[0] = data->sign[0]; - } - else if ((tmax[1] <= tmax[0]) && (tmax[1] <= tmax[2])) { - // printf("# Hit in Y %s\n", data->sign[1] ? "min", "max"); - rtmax = tmax[1]; - v1[1] = v2[1] = local_bvmax[1]; - mul = local_bvmax[1] * data->ray_direction[1]; - main_axis = 2; - r_axis_closest[1] = data->sign[1]; - } - else { - // printf("# Hit in Z %s\n", data->sign[2] ? "min", "max"); - rtmax = tmax[2]; - v1[2] = v2[2] = local_bvmax[2]; - mul = local_bvmax[2] * data->ray_direction[2]; - main_axis = 1; - r_axis_closest[2] = data->sign[2]; - } - - /* *** max_axis_v3(tmin) *** */ - if ((tmin[0] >= tmin[1]) && (tmin[0] >= tmin[2])) { - // printf("# To X %s\n", data->sign[0] ? "max", "min"); - rtmin = tmin[0]; - v1[0] = v2[0] = local_bvmin[0]; - mul += local_bvmin[0] * data->ray_direction[0]; - main_axis -= 3; - r_axis_closest[0] = !data->sign[0]; - } - else if ((tmin[1] >= tmin[0]) && (tmin[1] >= tmin[2])) { - // printf("# To Y %s\n", data->sign[1] ? "max", "min"); - rtmin = tmin[1]; - v1[1] = v2[1] = local_bvmin[1]; - mul += local_bvmin[1] * data->ray_direction[1]; - main_axis -= 1; - r_axis_closest[1] = !data->sign[1]; - } - else { - // printf("# To Z %s\n", data->sign[2] ? 
"max", "min"); - rtmin = tmin[2]; - v1[2] = v2[2] = local_bvmin[2]; - mul += local_bvmin[2] * data->ray_direction[2]; - main_axis -= 2; - r_axis_closest[2] = !data->sign[2]; - } - /* *** end min/max axis *** */ - - - /* `if rtmax < 0`, the whole `AABB` is behing us */ - if ((rtmax < 0.0f) && (rtmin < 0.0f)) { - return FLT_MAX; - } - - if (main_axis < 0) { - main_axis += 3; - } - - if (data->sign[main_axis] == 0) { - v1[main_axis] = local_bvmin[main_axis]; - v2[main_axis] = local_bvmax[main_axis]; - } - else { - v1[main_axis] = local_bvmax[main_axis]; - v2[main_axis] = local_bvmin[main_axis]; - } - - /* if rtmin < rtmax, ray intersect `AABB` */ - if (rtmin <= rtmax) { - const float proj = rtmin * data->ray_direction[main_axis]; - rdist = 0.0f; - r_axis_closest[main_axis] = (proj - v1[main_axis]) < (v2[main_axis] - proj); - } - else { - /* `proj` equals to nearest point on the ray closest to the edge `v1 v2` of the `AABB`. */ - const float proj = mul * data->cdot_axis[main_axis]; - float depth; - if (v1[main_axis] > proj) { /* the nearest point to the ray is the point v1 */ - /* `depth` is equivalent the distance from the origin to the point v1, - * Here's a faster way to calculate the dot product of v1 and ray - * (depth = dot_v3v3(v1, data->ray.direction))*/ - depth = mul + data->ray_direction[main_axis] * v1[main_axis]; - rdist = len_squared_v3(v1) - SQUARE(depth); - r_axis_closest[main_axis] = true; - } - else if (v2[main_axis] < proj) { /* the nearest point of the ray is the point v2 */ - depth = mul + data->ray_direction[main_axis] * v2[main_axis]; - rdist = len_squared_v3(v2) - SQUARE(depth); - r_axis_closest[main_axis] = false; - } - else { /* the nearest point of the ray is on the edge of the `AABB`. 
*/ - float v[2]; - mul *= data->idiag_sq[main_axis]; - if (main_axis == 0) { - v[0] = (mul * data->ray_direction[1]) - v1[1]; - v[1] = (mul * data->ray_direction[2]) - v1[2]; - } - else if (main_axis == 1) { - v[0] = (mul * data->ray_direction[0]) - v1[0]; - v[1] = (mul * data->ray_direction[2]) - v1[2]; - } - else { - v[0] = (mul * data->ray_direction[0]) - v1[0]; - v[1] = (mul * data->ray_direction[1]) - v1[1]; - } - rdist = len_squared_v2(v); - r_axis_closest[main_axis] = (proj - v1[main_axis]) < (v2[main_axis] - proj); - } - } - - return rdist; -} - /* find closest point to p on line through (l1, l2) and return lambda, * where (0 <= lambda <= 1) when cp is in the line segment (l1, l2) */ diff --git a/source/blender/blenlib/intern/polyfill2d.c b/source/blender/blenlib/intern/polyfill2d.c index 8d9881e4539..2969b0eccf4 100644 --- a/source/blender/blenlib/intern/polyfill2d.c +++ b/source/blender/blenlib/intern/polyfill2d.c @@ -21,8 +21,15 @@ /** \file blender/blenlib/intern/polyfill2d.c * \ingroup bli * - * A simple implementation of the ear cutting algorithm - * to triangulate simple polygons without holes. + * An ear clipping algorithm to triangulate single boundary polygons. + * + * Details: + * + * - The algorithm guarantees all triangles are assigned (number of coords - 2) + * and that triangles will have non-overlapping indices (even for degenerate geometry). + * - Self-intersections are considered degenerate (resulting triangles will overlap). + * - While multiple polygons aren't supported, holes can still be defined using *key-holes* + * (where the polygon doubles back on its self with *exactly* matching coordinates). * * \note * @@ -74,6 +81,12 @@ typedef signed char eSign; #ifdef USE_KDTREE /** + * Spatial optimization for point-in-triangle intersection checks. 
+ * The simple version of this algorithm is ``O(n^2)`` complexity + * (every point needing to check the triangle defined by every other point), + * Using a binary-tree reduces the complexity to ``O(n log n)`` + * plus some overhead of creating the tree. + * * This is a single purpose KDTree based on BLI_kdtree with some modifications * to better suit polyfill2d. * diff --git a/source/blender/blenlib/intern/rct.c b/source/blender/blenlib/intern/rct.c index ac73a981b45..fd24a00156d 100644 --- a/source/blender/blenlib/intern/rct.c +++ b/source/blender/blenlib/intern/rct.c @@ -351,6 +351,22 @@ void BLI_rcti_init(rcti *rect, int xmin, int xmax, int ymin, int ymax) } } +void BLI_rctf_init_pt_radius(rctf *rect, const float xy[2], float size) +{ + rect->xmin = xy[0] - size; + rect->xmax = xy[0] + size; + rect->ymin = xy[1] - size; + rect->ymax = xy[1] + size; +} + +void BLI_rcti_init_pt_radius(rcti *rect, const int xy[2], int size) +{ + rect->xmin = xy[0] - size; + rect->xmax = xy[0] + size; + rect->ymin = xy[1] - size; + rect->ymax = xy[1] + size; +} + void BLI_rcti_init_minmax(rcti *rect) { rect->xmin = rect->ymin = INT_MAX; diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c index fc2d9674c2f..49d2ee83a66 100644 --- a/source/blender/blenlib/intern/task.c +++ b/source/blender/blenlib/intern/task.c @@ -48,6 +48,32 @@ */ #define MEMPOOL_SIZE 256 +/* Number of tasks which are pushed directly to local thread queue. + * + * This allows thread to fetch next task without locking the whole queue. 
+ */ +#define LOCALQUEUE_SIZE 1 + +#ifndef NDEBUG +# define ASSERT_THREAD_ID(scheduler, thread_id) \ + do { \ + if (!BLI_thread_is_main()) { \ + TaskThread *thread = pthread_getspecific(scheduler->tls_id_key); \ + if (thread == NULL) { \ + BLI_assert(thread_id == 0); \ + } \ + else { \ + BLI_assert(thread_id == thread->id); \ + } \ + } \ + else { \ + BLI_assert(thread_id == 0); \ + } \ + } while (false) +#else +# define ASSERT_THREAD_ID(scheduler, thread_id) +#endif + typedef struct Task { struct Task *next, *prev; @@ -102,13 +128,16 @@ typedef struct TaskMemPoolStats { } TaskMemPoolStats; #endif +typedef struct TaskThreadLocalStorage { + TaskMemPool task_mempool; + int num_local_queue; + Task *local_queue[LOCALQUEUE_SIZE]; +} TaskThreadLocalStorage; + struct TaskPool { TaskScheduler *scheduler; volatile size_t num; - volatile size_t done; - size_t num_threads; - size_t currently_running_tasks; ThreadMutex num_mutex; ThreadCondition num_cond; @@ -116,6 +145,11 @@ struct TaskPool { ThreadMutex user_mutex; volatile bool do_cancel; + volatile bool do_work; + + volatile bool is_suspended; + ListBase suspended_queue; + size_t num_suspended; /* If set, this pool may never be work_and_wait'ed, which means TaskScheduler * has to use its special background fallback thread in case we are in @@ -123,16 +157,10 @@ struct TaskPool { */ bool run_in_background; - /* This pool is used for caching task pointers for thread id 0. - * This could either point to a global scheduler's task_mempool[0] if the - * pool is handled form the main thread or point to task_mempool_local - * otherwise. - * - * This way we solve possible threading conflicts accessing same global - * memory pool from multiple threads from which wait_work() is called. + /* This is a task scheduler's ID of a thread at which pool was constructed. + * It will be used to access task TLS. 
*/ - TaskMemPool *task_mempool; - TaskMemPool task_mempool_local; + int thread_id; #ifdef DEBUG_STATS TaskMemPoolStats *mempool_stats; @@ -142,7 +170,6 @@ struct TaskPool { struct TaskScheduler { pthread_t *threads; struct TaskThread *task_threads; - TaskMemPool *task_mempool; int num_threads; bool background_thread_only; @@ -151,15 +178,19 @@ struct TaskScheduler { ThreadCondition queue_cond; volatile bool do_exit; + + /* NOTE: In pthread's TLS we store the whole TaskThread structure. */ + pthread_key_t tls_id_key; }; typedef struct TaskThread { TaskScheduler *scheduler; int id; + TaskThreadLocalStorage tls; } TaskThread; /* Helper */ -static void task_data_free(Task *task, const int thread_id) +BLI_INLINE void task_data_free(Task *task, const int thread_id) { if (task->free_taskdata) { if (task->freedata) { @@ -171,28 +202,42 @@ static void task_data_free(Task *task, const int thread_id) } } -BLI_INLINE TaskMemPool *get_task_mempool(TaskPool *pool, const int thread_id) +BLI_INLINE TaskThreadLocalStorage *get_task_tls(TaskPool *pool, + const int thread_id) { + TaskScheduler *scheduler = pool->scheduler; + BLI_assert(thread_id >= 0); + BLI_assert(thread_id <= scheduler->num_threads); if (thread_id == 0) { - return pool->task_mempool; + return &scheduler->task_threads[pool->thread_id].tls; + } + return &scheduler->task_threads[thread_id].tls; +} + +BLI_INLINE void free_task_tls(TaskThreadLocalStorage *tls) +{ + TaskMemPool *task_mempool = &tls->task_mempool; + for (int i = 0; i < task_mempool->num_tasks; ++i) { + MEM_freeN(task_mempool->tasks[i]); } - return &pool->scheduler->task_mempool[thread_id]; } static Task *task_alloc(TaskPool *pool, const int thread_id) { - assert(thread_id <= pool->scheduler->num_threads); + BLI_assert(thread_id <= pool->scheduler->num_threads); if (thread_id != -1) { - assert(thread_id >= 0); - TaskMemPool *mem_pool = get_task_mempool(pool, thread_id); + BLI_assert(thread_id >= 0); + BLI_assert(thread_id <= pool->scheduler->num_threads); 
+ TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id); + TaskMemPool *task_mempool = &tls->task_mempool; /* Try to re-use task memory from a thread local storage. */ - if (mem_pool->num_tasks > 0) { - --mem_pool->num_tasks; + if (task_mempool->num_tasks > 0) { + --task_mempool->num_tasks; /* Success! We've just avoided task allocation. */ #ifdef DEBUG_STATS pool->mempool_stats[thread_id].num_reuse++; #endif - return mem_pool->tasks[mem_pool->num_tasks]; + return task_mempool->tasks[task_mempool->num_tasks]; } /* We are doomed to allocate new task data. */ #ifdef DEBUG_STATS @@ -205,13 +250,14 @@ static Task *task_alloc(TaskPool *pool, const int thread_id) static void task_free(TaskPool *pool, Task *task, const int thread_id) { task_data_free(task, thread_id); - assert(thread_id >= 0); - assert(thread_id <= pool->scheduler->num_threads); - TaskMemPool *mem_pool = get_task_mempool(pool, thread_id); - if (mem_pool->num_tasks < MEMPOOL_SIZE - 1) { + BLI_assert(thread_id >= 0); + BLI_assert(thread_id <= pool->scheduler->num_threads); + TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id); + TaskMemPool *task_mempool = &tls->task_mempool; + if (task_mempool->num_tasks < MEMPOOL_SIZE - 1) { /* Successfully allowed the task to be re-used later. 
*/ - mem_pool->tasks[mem_pool->num_tasks] = task; - ++mem_pool->num_tasks; + task_mempool->tasks[task_mempool->num_tasks] = task; + ++task_mempool->num_tasks; } else { /* Local storage saturated, no other way than just discard @@ -237,8 +283,6 @@ static void task_pool_num_decrease(TaskPool *pool, size_t done) BLI_assert(pool->num >= done); pool->num -= done; - atomic_sub_and_fetch_z(&pool->currently_running_tasks, done); - pool->done += done; if (pool->num == 0) BLI_condition_notify_all(&pool->num_cond); @@ -246,11 +290,11 @@ static void task_pool_num_decrease(TaskPool *pool, size_t done) BLI_mutex_unlock(&pool->num_mutex); } -static void task_pool_num_increase(TaskPool *pool) +static void task_pool_num_increase(TaskPool *pool, size_t new) { BLI_mutex_lock(&pool->num_mutex); - pool->num++; + pool->num += new; BLI_condition_notify_all(&pool->num_cond); BLI_mutex_unlock(&pool->num_mutex); @@ -292,17 +336,10 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task continue; } - if (atomic_add_and_fetch_z(&pool->currently_running_tasks, 1) <= pool->num_threads || - pool->num_threads == 0) - { - *task = current_task; - found_task = true; - BLI_remlink(&scheduler->queue, *task); - break; - } - else { - atomic_sub_and_fetch_z(&pool->currently_running_tasks, 1); - } + *task = current_task; + found_task = true; + BLI_remlink(&scheduler->queue, *task); + break; } if (!found_task) BLI_condition_wait(&scheduler->queue_cond, &scheduler->queue_mutex); @@ -313,13 +350,34 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task return true; } +BLI_INLINE void handle_local_queue(TaskThreadLocalStorage *tls, + const int thread_id) +{ + while (tls->num_local_queue > 0) { + /* We pop task from queue before handling it so handler of the task can + * push next job to the local queue. 
+ */ + tls->num_local_queue--; + Task *local_task = tls->local_queue[tls->num_local_queue]; + /* TODO(sergey): Double-check work_and_wait() doesn't handle other's + * pool tasks. + */ + TaskPool *local_pool = local_task->pool; + local_task->run(local_pool, local_task->taskdata, thread_id); + task_free(local_pool, local_task, thread_id); + } +} + static void *task_scheduler_thread_run(void *thread_p) { TaskThread *thread = (TaskThread *) thread_p; + TaskThreadLocalStorage *tls = &thread->tls; TaskScheduler *scheduler = thread->scheduler; int thread_id = thread->id; Task *task; + pthread_setspecific(scheduler->tls_id_key, thread); + /* keep popping off tasks */ while (task_scheduler_thread_wait_pop(scheduler, &task)) { TaskPool *pool = task->pool; @@ -330,6 +388,9 @@ static void *task_scheduler_thread_run(void *thread_p) /* delete task */ task_free(pool, task, thread_id); + /* Handle all tasks from local queue. */ + handle_local_queue(tls, thread_id); + /* notify pool task was done */ task_pool_num_decrease(pool, 1); } @@ -359,20 +420,24 @@ TaskScheduler *BLI_task_scheduler_create(int num_threads) /* Add background-only thread if needed. 
*/ if (num_threads == 0) { - scheduler->background_thread_only = true; - num_threads = 1; + scheduler->background_thread_only = true; + num_threads = 1; } + scheduler->task_threads = MEM_callocN(sizeof(TaskThread) * (num_threads + 1), + "TaskScheduler task threads"); + + pthread_key_create(&scheduler->tls_id_key, NULL); + /* launch threads that will be waiting for work */ if (num_threads > 0) { int i; scheduler->num_threads = num_threads; scheduler->threads = MEM_callocN(sizeof(pthread_t) * num_threads, "TaskScheduler threads"); - scheduler->task_threads = MEM_callocN(sizeof(TaskThread) * num_threads, "TaskScheduler task threads"); for (i = 0; i < num_threads; i++) { - TaskThread *thread = &scheduler->task_threads[i]; + TaskThread *thread = &scheduler->task_threads[i + 1]; thread->scheduler = scheduler; thread->id = i + 1; @@ -380,9 +445,6 @@ TaskScheduler *BLI_task_scheduler_create(int num_threads) fprintf(stderr, "TaskScheduler failed to launch thread %d/%d\n", i, num_threads); } } - - scheduler->task_mempool = MEM_callocN(sizeof(*scheduler->task_mempool) * (num_threads + 1), - "TaskScheduler task_mempool"); } return scheduler; @@ -398,6 +460,8 @@ void BLI_task_scheduler_free(TaskScheduler *scheduler) BLI_condition_notify_all(&scheduler->queue_cond); BLI_mutex_unlock(&scheduler->queue_mutex); + pthread_key_delete(scheduler->tls_id_key); + /* delete threads */ if (scheduler->threads) { int i; @@ -412,17 +476,12 @@ void BLI_task_scheduler_free(TaskScheduler *scheduler) /* Delete task thread data */ if (scheduler->task_threads) { - MEM_freeN(scheduler->task_threads); - } - - /* Delete task memory pool */ - if (scheduler->task_mempool) { - for (int i = 0; i <= scheduler->num_threads; ++i) { - for (int j = 0; j < scheduler->task_mempool[i].num_tasks; ++j) { - MEM_freeN(scheduler->task_mempool[i].tasks[j]); - } + for (int i = 0; i < scheduler->num_threads + 1; ++i) { + TaskThreadLocalStorage *tls = &scheduler->task_threads[i].tls; + free_task_tls(tls); } - 
MEM_freeN(scheduler->task_mempool); + + MEM_freeN(scheduler->task_threads); } /* delete leftover tasks */ @@ -445,7 +504,7 @@ int BLI_task_scheduler_num_threads(TaskScheduler *scheduler) static void task_scheduler_push(TaskScheduler *scheduler, Task *task, TaskPriority priority) { - task_pool_num_increase(task->pool); + task_pool_num_increase(task->pool, 1); /* add task to queue */ BLI_mutex_lock(&scheduler->queue_mutex); @@ -471,7 +530,7 @@ static void task_scheduler_clear(TaskScheduler *scheduler, TaskPool *pool) nexttask = task->next; if (task->pool == pool) { - task_data_free(task, 0); + task_data_free(task, pool->thread_id); BLI_freelinkN(&scheduler->queue, task); done++; @@ -486,7 +545,10 @@ static void task_scheduler_clear(TaskScheduler *scheduler, TaskPool *pool) /* Task Pool */ -static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, const bool is_background) +static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, + void *userdata, + const bool is_background, + const bool is_suspended) { TaskPool *pool = MEM_mallocN(sizeof(TaskPool), "TaskPool"); @@ -504,10 +566,11 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c pool->scheduler = scheduler; pool->num = 0; - pool->done = 0; - pool->num_threads = 0; - pool->currently_running_tasks = 0; pool->do_cancel = false; + pool->do_work = false; + pool->is_suspended = is_suspended; + pool->num_suspended = 0; + pool->suspended_queue.first = pool->suspended_queue.last = NULL; pool->run_in_background = is_background; BLI_mutex_init(&pool->num_mutex); @@ -517,11 +580,21 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c BLI_mutex_init(&pool->user_mutex); if (BLI_thread_is_main()) { - pool->task_mempool = scheduler->task_mempool; + pool->thread_id = 0; } else { - pool->task_mempool = &pool->task_mempool_local; - pool->task_mempool_local.num_tasks = 0; + TaskThread *thread = pthread_getspecific(scheduler->tls_id_key); + /* 
NOTE: It is possible that pool is created from non-main thread + * which isn't a scheduler thread. In this case pthread's TLS will + * be NULL and we can safely consider thread id 0 for the main + * thread of this pool (the one which does wort_and_wait()). + */ + if (thread == NULL) { + pool->thread_id = 0; + } + else { + pool->thread_id = thread->id; + } } #ifdef DEBUG_STATS @@ -548,7 +621,7 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c */ TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata) { - return task_pool_create_ex(scheduler, userdata, false); + return task_pool_create_ex(scheduler, userdata, false, false); } /** @@ -563,25 +636,28 @@ TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata) */ TaskPool *BLI_task_pool_create_background(TaskScheduler *scheduler, void *userdata) { - return task_pool_create_ex(scheduler, userdata, true); + return task_pool_create_ex(scheduler, userdata, true, false); +} + +/** + * Similar to BLI_task_pool_create() but does not schedule any tasks for execution + * for until BLI_task_pool_work_and_wait() is called. This helps reducing therading + * overhead when pushing huge amount of small initial tasks from the main thread. + */ +TaskPool *BLI_task_pool_create_suspended(TaskScheduler *scheduler, void *userdata) +{ + return task_pool_create_ex(scheduler, userdata, false, true); } void BLI_task_pool_free(TaskPool *pool) { - BLI_task_pool_stop(pool); + BLI_task_pool_cancel(pool); BLI_mutex_end(&pool->num_mutex); BLI_condition_end(&pool->num_cond); BLI_mutex_end(&pool->user_mutex); - /* Free local memory pool, those pointers are lost forever. 
*/ - if (pool->task_mempool == &pool->task_mempool_local) { - for (int i = 0; i < pool->task_mempool_local.num_tasks; i++) { - MEM_freeN(pool->task_mempool_local.tasks[i]); - } - } - #ifdef DEBUG_STATS printf("Thread ID Allocated Reused Discarded\n"); for (int i = 0; i < pool->scheduler->num_threads + 1; ++i) { @@ -612,6 +688,25 @@ static void task_pool_push( task->freedata = freedata; task->pool = pool; + if (pool->is_suspended) { + BLI_addhead(&pool->suspended_queue, task); + atomic_fetch_and_add_z(&pool->num_suspended, 1); + return; + } + + if (thread_id != -1 && + (thread_id != pool->thread_id || pool->do_work)) + { + ASSERT_THREAD_ID(pool->scheduler, thread_id); + + TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id); + if (tls->num_local_queue < LOCALQUEUE_SIZE) { + tls->local_queue[tls->num_local_queue] = task; + tls->num_local_queue++; + return; + } + } + task_scheduler_push(pool->scheduler, task, priority); } @@ -636,8 +731,27 @@ void BLI_task_pool_push_from_thread(TaskPool *pool, TaskRunFunction run, void BLI_task_pool_work_and_wait(TaskPool *pool) { + TaskThreadLocalStorage *tls = get_task_tls(pool, pool->thread_id); TaskScheduler *scheduler = pool->scheduler; + if (atomic_fetch_and_and_uint8((uint8_t*)&pool->is_suspended, 0)) { + if (pool->num_suspended) { + task_pool_num_increase(pool, pool->num_suspended); + BLI_mutex_lock(&scheduler->queue_mutex); + + BLI_movelisttolist(&scheduler->queue, &pool->suspended_queue); + + BLI_condition_notify_all(&scheduler->queue_cond); + BLI_mutex_unlock(&scheduler->queue_mutex); + + } + pool->is_suspended = false; + } + + pool->do_work = true; + + ASSERT_THREAD_ID(pool->scheduler, pool->thread_id); + BLI_mutex_lock(&pool->num_mutex); while (pool->num != 0) { @@ -651,16 +765,12 @@ void BLI_task_pool_work_and_wait(TaskPool *pool) /* find task from this pool. 
if we get a task from another pool, * we can get into deadlock */ - if (pool->num_threads == 0 || - pool->currently_running_tasks < pool->num_threads) - { - for (task = scheduler->queue.first; task; task = task->next) { - if (task->pool == pool) { - work_task = task; - found_task = true; - BLI_remlink(&scheduler->queue, task); - break; - } + for (task = scheduler->queue.first; task; task = task->next) { + if (task->pool == pool) { + work_task = task; + found_task = true; + BLI_remlink(&scheduler->queue, task); + break; } } @@ -669,11 +779,13 @@ void BLI_task_pool_work_and_wait(TaskPool *pool) /* if found task, do it, otherwise wait until other tasks are done */ if (found_task) { /* run task */ - atomic_add_and_fetch_z(&pool->currently_running_tasks, 1); - work_task->run(pool, work_task->taskdata, 0); + work_task->run(pool, work_task->taskdata, pool->thread_id); /* delete task */ - task_free(pool, task, 0); + task_free(pool, task, pool->thread_id); + + /* Handle all tasks from local queue. */ + handle_local_queue(tls, pool->thread_id); /* notify pool task was done */ task_pool_num_decrease(pool, 1); @@ -688,22 +800,8 @@ void BLI_task_pool_work_and_wait(TaskPool *pool) } BLI_mutex_unlock(&pool->num_mutex); -} -int BLI_pool_get_num_threads(TaskPool *pool) -{ - if (pool->num_threads != 0) { - return pool->num_threads; - } - else { - return BLI_task_scheduler_num_threads(pool->scheduler); - } -} - -void BLI_pool_set_num_threads(TaskPool *pool, int num_threads) -{ - /* NOTE: Don't try to modify threads while tasks are running! 
*/ - pool->num_threads = num_threads; + handle_local_queue(tls, pool->thread_id); } void BLI_task_pool_cancel(TaskPool *pool) @@ -721,13 +819,6 @@ void BLI_task_pool_cancel(TaskPool *pool) pool->do_cancel = false; } -void BLI_task_pool_stop(TaskPool *pool) -{ - task_scheduler_clear(pool->scheduler, pool); - - BLI_assert(pool->num == 0); -} - bool BLI_task_pool_canceled(TaskPool *pool) { return pool->do_cancel; @@ -743,11 +834,6 @@ ThreadMutex *BLI_task_pool_user_mutex(TaskPool *pool) return &pool->user_mutex; } -size_t BLI_task_pool_tasks_done(TaskPool *pool) -{ - return pool->done; -} - /* Parallel range routines */ /** @@ -918,7 +1004,8 @@ static void task_parallel_range_ex( BLI_task_pool_push_from_thread(task_pool, parallel_range_func, userdata_chunk_local, false, - TASK_PRIORITY_HIGH, 0); + TASK_PRIORITY_HIGH, + task_pool->thread_id); } BLI_task_pool_work_and_wait(task_pool); @@ -1124,7 +1211,8 @@ void BLI_task_parallel_listbase( BLI_task_pool_push_from_thread(task_pool, parallel_listbase_func, NULL, false, - TASK_PRIORITY_HIGH, 0); + TASK_PRIORITY_HIGH, + task_pool->thread_id); } BLI_task_pool_work_and_wait(task_pool); diff --git a/source/blender/blenlib/intern/threads.c b/source/blender/blenlib/intern/threads.c index b60981802aa..77da3be0600 100644 --- a/source/blender/blenlib/intern/threads.c +++ b/source/blender/blenlib/intern/threads.c @@ -54,6 +54,8 @@ # include <sys/time.h> #endif +#include "atomic_ops.h" + #if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && !defined(__clang__) # define USE_APPLE_OMP_FIX #endif @@ -124,7 +126,7 @@ static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_t mainid; -static int thread_levels = 0; /* threads can be invoked inside threads */ +static unsigned int thread_levels = 0; /* threads can be invoked inside threads */ static int 
num_threads_override = 0; /* just a max for security reasons */ @@ -198,9 +200,9 @@ void BLI_init_threads(ListBase *threadbase, void *(*do_thread)(void *), int tot) tslot->avail = 1; } } - - BLI_spin_lock(&_malloc_lock); - if (thread_levels == 0) { + + unsigned int level = atomic_fetch_and_add_u(&thread_levels, 1); + if (level == 0) { MEM_set_lock_callback(BLI_lock_malloc_thread, BLI_unlock_malloc_thread); #ifdef USE_APPLE_OMP_FIX @@ -210,9 +212,6 @@ void BLI_init_threads(ListBase *threadbase, void *(*do_thread)(void *), int tot) thread_tls_data = pthread_getspecific(gomp_tls_key); #endif } - - thread_levels++; - BLI_spin_unlock(&_malloc_lock); } /* amount of available threads */ @@ -331,11 +330,10 @@ void BLI_end_threads(ListBase *threadbase) BLI_freelistN(threadbase); } - BLI_spin_lock(&_malloc_lock); - thread_levels--; - if (thread_levels == 0) + unsigned int level = atomic_sub_and_fetch_u(&thread_levels, 1); + if (level == 0) { MEM_set_lock_callback(NULL, NULL); - BLI_spin_unlock(&_malloc_lock); + } } /* System Information */ @@ -812,26 +810,17 @@ void BLI_thread_queue_wait_finish(ThreadQueue *queue) void BLI_begin_threaded_malloc(void) { - /* Used for debug only */ - /* BLI_assert(thread_levels >= 0); */ - - BLI_spin_lock(&_malloc_lock); - if (thread_levels == 0) { + unsigned int level = atomic_fetch_and_add_u(&thread_levels, 1); + if (level == 0) { MEM_set_lock_callback(BLI_lock_malloc_thread, BLI_unlock_malloc_thread); } - thread_levels++; - BLI_spin_unlock(&_malloc_lock); } void BLI_end_threaded_malloc(void) { - /* Used for debug only */ - /* BLI_assert(thread_levels >= 0); */ - - BLI_spin_lock(&_malloc_lock); - thread_levels--; - if (thread_levels == 0) + unsigned int level = atomic_sub_and_fetch_u(&thread_levels, 1); + if (level == 0) { MEM_set_lock_callback(NULL, NULL); - BLI_spin_unlock(&_malloc_lock); + } } diff --git a/source/blender/blenloader/intern/readfile.c b/source/blender/blenloader/intern/readfile.c index 971f5d54b10..e7d3d4c369a 100644 --- 
a/source/blender/blenloader/intern/readfile.c +++ b/source/blender/blenloader/intern/readfile.c @@ -5317,6 +5317,37 @@ static void direct_link_modifiers(FileData *fd, ListBase *lb) MeshSeqCacheModifierData *msmcd = (MeshSeqCacheModifierData *)md; msmcd->reader = NULL; } + else if (md->type == eModifierType_SurfaceDeform) { + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + + smd->verts = newdataadr(fd, smd->verts); + + if (smd->verts) { + for (int i = 0; i < smd->numverts; i++) { + smd->verts[i].binds = newdataadr(fd, smd->verts[i].binds); + + if (smd->verts[i].binds) { + for (int j = 0; j < smd->verts[i].numbinds; j++) { + smd->verts[i].binds[j].vert_inds = newdataadr(fd, smd->verts[i].binds[j].vert_inds); + smd->verts[i].binds[j].vert_weights = newdataadr(fd, smd->verts[i].binds[j].vert_weights); + + if (fd->flags & FD_FLAGS_SWITCH_ENDIAN) { + if (smd->verts[i].binds[j].vert_inds) + BLI_endian_switch_uint32_array(smd->verts[i].binds[j].vert_inds, smd->verts[i].binds[j].numverts); + + if (smd->verts[i].binds[j].vert_weights) { + if (smd->verts[i].binds[j].mode == MOD_SDEF_MODE_CENTROID || + smd->verts[i].binds[j].mode == MOD_SDEF_MODE_LOOPTRI) + BLI_endian_switch_float_array(smd->verts[i].binds[j].vert_weights, 3); + else + BLI_endian_switch_float_array(smd->verts[i].binds[j].vert_weights, smd->verts[i].binds[j].numverts); + } + } + } + } + } + } + } } } diff --git a/source/blender/blenloader/intern/versioning_270.c b/source/blender/blenloader/intern/versioning_270.c index 610c74148eb..8eb61251ddd 100644 --- a/source/blender/blenloader/intern/versioning_270.c +++ b/source/blender/blenloader/intern/versioning_270.c @@ -1577,6 +1577,41 @@ void blo_do_versions_270(FileData *fd, Library *UNUSED(lib), Main *main) } } + /* Fix for T50736, Glare comp node using same var for two different things. 
*/ + if (!DNA_struct_elem_find(fd->filesdna, "NodeGlare", "char", "star_45")) { + FOREACH_NODETREE(main, ntree, id) { + if (ntree->type == NTREE_COMPOSIT) { + ntreeSetTypes(NULL, ntree); + for (bNode *node = ntree->nodes.first; node; node = node->next) { + if (node->type == CMP_NODE_GLARE) { + NodeGlare *ndg = node->storage; + switch (ndg->type) { + case 2: /* Grrrr! magic numbers :( */ + ndg->streaks = ndg->angle; + break; + case 0: + ndg->star_45 = ndg->angle != 0; + break; + default: + break; + } + } + } + } + } FOREACH_NODETREE_END + } + + if (!DNA_struct_elem_find(fd->filesdna, "SurfaceDeformModifierData", "float", "mat[4][4]")) { + for (Object *ob = main->object.first; ob; ob = ob->id.next) { + for (ModifierData *md = ob->modifiers.first; md; md = md->next) { + if (md->type == eModifierType_SurfaceDeform) { + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + unit_m4(smd->mat); + } + } + } + } + /* initialize regiondata for each SpaceClip, due to the newly brought RegionSpaceClip */ if (!DNA_struct_elem_find(fd->filesdna, "SpaceClip", "MovieClip", "*secondary_clip")) { for (bScreen *screen = main->screen.first; screen != NULL; screen = screen->id.next) { diff --git a/source/blender/blenloader/intern/writefile.c b/source/blender/blenloader/intern/writefile.c index 106943d15dc..eef38c479e8 100644 --- a/source/blender/blenloader/intern/writefile.c +++ b/source/blender/blenloader/intern/writefile.c @@ -78,7 +78,7 @@ * - write #TEST (#RenderInfo struct. 128x128 blend file preview is optional). * - write #GLOB (#FileGlobal struct) (some global vars). * - write #DNA1 (#SDNA struct) - * - write #USER (#UserDef struct) if filename is ``~/X.XX/config/startup.blend``. + * - write #USER (#UserDef struct) if filename is ``~/.config/blender/X.XX/config/startup.blend``. 
*/ @@ -1026,6 +1026,25 @@ static void write_nodetree(WriteData *wd, bNodeTree *ntree) { /* pass */ } + else if ((ntree->type == NTREE_COMPOSIT) && (node->type == CMP_NODE_GLARE)) { + /* Simple forward compat for fix for T50736. + * Not ideal (there is no ideal solution here), but should do for now. */ + NodeGlare *ndg = node->storage; + /* Not in undo case. */ + if (!wd->current) { + switch (ndg->type) { + case 2: /* Grrrr! magic numbers :( */ + ndg->angle = ndg->streaks; + break; + case 0: + ndg->angle = ndg->star_45; + break; + default: + break; + } + } + writestruct_id(wd, DATA, node->typeinfo->storagename, 1, node->storage); + } else { writestruct_id(wd, DATA, node->typeinfo->storagename, 1, node->storage); } @@ -1818,6 +1837,32 @@ static void write_modifiers(WriteData *wd, ListBase *modbase) writedata(wd, DATA, sizeof(float[3]) * csmd->bind_coords_num, csmd->bind_coords); } } + else if (md->type == eModifierType_SurfaceDeform) { + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + + writestruct(wd, DATA, SDefVert, smd->numverts, smd->verts); + + if (smd->verts) { + for (int i = 0; i < smd->numverts; i++) { + writestruct(wd, DATA, SDefBind, smd->verts[i].numbinds, smd->verts[i].binds); + + if (smd->verts[i].binds) { + for (int j = 0; j < smd->verts[i].numbinds; j++) { + writedata(wd, DATA, sizeof(int) * smd->verts[i].binds[j].numverts, smd->verts[i].binds[j].vert_inds); + + if (smd->verts[i].binds[j].mode == MOD_SDEF_MODE_CENTROID || + smd->verts[i].binds[j].mode == MOD_SDEF_MODE_LOOPTRI) + { + writedata(wd, DATA, sizeof(float) * 3, smd->verts[i].binds[j].vert_weights); + } + else { + writedata(wd, DATA, sizeof(float) * smd->verts[i].binds[j].numverts, smd->verts[i].binds[j].vert_weights); + } + } + } + } + } + } } } diff --git a/source/blender/bmesh/operators/bmo_primitive.c b/source/blender/bmesh/operators/bmo_primitive.c index 8408169d85e..723e0b168e0 100644 --- a/source/blender/bmesh/operators/bmo_primitive.c +++ 
b/source/blender/bmesh/operators/bmo_primitive.c @@ -1122,7 +1122,7 @@ static void bm_mesh_calc_uvs_sphere_face(BMFace *f, const int cd_loop_uv_offset) } /* Shift borderline coordinates to the left. */ - if (fabsf(theta - M_PI) < 0.0001f) { + if (fabsf(theta - (float)M_PI) < 0.0001f) { theta = -M_PI; } diff --git a/source/blender/bmesh/tools/bmesh_intersect.c b/source/blender/bmesh/tools/bmesh_intersect.c index 58234ddf3bd..2cb82d0fc02 100644 --- a/source/blender/bmesh/tools/bmesh_intersect.c +++ b/source/blender/bmesh/tools/bmesh_intersect.c @@ -986,7 +986,7 @@ bool BM_mesh_intersect( struct BMLoop *(*looptris)[3], const int looptris_tot, int (*test_fn)(BMFace *f, void *user_data), void *user_data, const bool use_self, const bool use_separate, const bool use_dissolve, const bool use_island_connect, - const int boolean_mode, + const bool use_edge_tag, const int boolean_mode, const float eps) { struct ISectState s; @@ -1526,7 +1526,7 @@ bool BM_mesh_intersect( BM_mesh_edgesplit(bm, false, true, false); } - else if (boolean_mode != BMESH_ISECT_BOOLEAN_NONE) { + else if (boolean_mode != BMESH_ISECT_BOOLEAN_NONE || use_edge_tag) { GSetIterator gs_iter; /* no need to clear for boolean */ diff --git a/source/blender/bmesh/tools/bmesh_intersect.h b/source/blender/bmesh/tools/bmesh_intersect.h index d0cc41654eb..51926a01710 100644 --- a/source/blender/bmesh/tools/bmesh_intersect.h +++ b/source/blender/bmesh/tools/bmesh_intersect.h @@ -30,7 +30,7 @@ bool BM_mesh_intersect( struct BMLoop *(*looptris)[3], const int looptris_tot, int (*test_fn)(BMFace *f, void *user_data), void *user_data, const bool use_self, const bool use_separate, const bool use_dissolve, const bool use_island_connect, - const int boolean_mode, + const bool use_edge_tag, const int boolean_mode, const float eps); enum { diff --git a/source/blender/collada/SkinInfo.cpp b/source/blender/collada/SkinInfo.cpp index 7242a24523c..71875d6274a 100644 --- a/source/blender/collada/SkinInfo.cpp +++ 
b/source/blender/collada/SkinInfo.cpp @@ -230,7 +230,6 @@ void SkinInfo::link_armature(bContext *C, Object *ob, std::map<COLLADAFW::Unique ModifierData *md = ED_object_modifier_add(NULL, bmain, scene, ob, NULL, eModifierType_Armature); ArmatureModifierData *amd = (ArmatureModifierData *)md; amd->object = ob_arm; - struct bArmature *armature = (bArmature *)ob_arm->data; #if 1 bc_set_parent(ob, ob_arm, C); diff --git a/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp b/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp index e1ada9a8c39..5f78067220a 100644 --- a/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp +++ b/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp @@ -94,4 +94,10 @@ void ConvolutionEdgeFilterOperation::executePixel(float output[4], int x, int y, output[2] = output[2] * value[0] + in2[2] * mval; output[3] = in2[3]; + + /* Make sure we don't return negative color. */ + output[0] = max(output[0], 0.0f); + output[1] = max(output[1], 0.0f); + output[2] = max(output[2], 0.0f); + output[3] = max(output[3], 0.0f); } diff --git a/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp b/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp index 68ec2be5ebd..6ac1ff9a1eb 100644 --- a/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp +++ b/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp @@ -107,6 +107,12 @@ void ConvolutionFilterOperation::executePixel(float output[4], int x, int y, voi output[1] = output[1] * value[0] + in2[1] * mval; output[2] = output[2] * value[0] + in2[2] * mval; output[3] = output[3] * value[0] + in2[3] * mval; + + /* Make sure we don't return negative color. 
*/ + output[0] = max(output[0], 0.0f); + output[1] = max(output[1], 0.0f); + output[2] = max(output[2], 0.0f); + output[3] = max(output[3], 0.0f); } bool ConvolutionFilterOperation::determineDependingAreaOfInterest(rcti *input, ReadBufferOperation *readOperation, rcti *output) diff --git a/source/blender/compositor/operations/COM_GlareSimpleStarOperation.cpp b/source/blender/compositor/operations/COM_GlareSimpleStarOperation.cpp index 957ac5af748..57aa3a1bac2 100644 --- a/source/blender/compositor/operations/COM_GlareSimpleStarOperation.cpp +++ b/source/blender/compositor/operations/COM_GlareSimpleStarOperation.cpp @@ -44,18 +44,18 @@ void GlareSimpleStarOperation::generateGlare(float *data, MemoryBuffer *inputTil xp = x + i; tbuf1->read(c, x, y); mul_v3_fl(c, f1); - tbuf1->read(tc, (settings->angle ? xm : x), ym); + tbuf1->read(tc, (settings->star_45 ? xm : x), ym); madd_v3_v3fl(c, tc, f2); - tbuf1->read(tc, (settings->angle ? xp : x), yp); + tbuf1->read(tc, (settings->star_45 ? xp : x), yp); madd_v3_v3fl(c, tc, f2); c[3] = 1.0f; tbuf1->writePixel(x, y, c); tbuf2->read(c, x, y); mul_v3_fl(c, f1); - tbuf2->read(tc, xm, (settings->angle ? yp : y)); + tbuf2->read(tc, xm, (settings->star_45 ? yp : y)); madd_v3_v3fl(c, tc, f2); - tbuf2->read(tc, xp, (settings->angle ? ym : y)); + tbuf2->read(tc, xp, (settings->star_45 ? ym : y)); madd_v3_v3fl(c, tc, f2); c[3] = 1.0f; tbuf2->writePixel(x, y, c); @@ -73,18 +73,18 @@ void GlareSimpleStarOperation::generateGlare(float *data, MemoryBuffer *inputTil xp = x + i; tbuf1->read(c, x, y); mul_v3_fl(c, f1); - tbuf1->read(tc, (settings->angle ? xm : x), ym); + tbuf1->read(tc, (settings->star_45 ? xm : x), ym); madd_v3_v3fl(c, tc, f2); - tbuf1->read(tc, (settings->angle ? xp : x), yp); + tbuf1->read(tc, (settings->star_45 ? xp : x), yp); madd_v3_v3fl(c, tc, f2); c[3] = 1.0f; tbuf1->writePixel(x, y, c); tbuf2->read(c, x, y); mul_v3_fl(c, f1); - tbuf2->read(tc, xm, (settings->angle ? yp : y)); + tbuf2->read(tc, xm, (settings->star_45 ? 
yp : y)); madd_v3_v3fl(c, tc, f2); - tbuf2->read(tc, xp, (settings->angle ? ym : y)); + tbuf2->read(tc, xp, (settings->star_45 ? ym : y)); madd_v3_v3fl(c, tc, f2); c[3] = 1.0f; tbuf2->writePixel(x, y, c); diff --git a/source/blender/compositor/operations/COM_GlareStreaksOperation.cpp b/source/blender/compositor/operations/COM_GlareStreaksOperation.cpp index da6076337b4..535f2952e5d 100644 --- a/source/blender/compositor/operations/COM_GlareStreaksOperation.cpp +++ b/source/blender/compositor/operations/COM_GlareStreaksOperation.cpp @@ -28,7 +28,7 @@ void GlareStreaksOperation::generateGlare(float *data, MemoryBuffer *inputTile, int x, y, n; unsigned int nump = 0; float c1[4], c2[4], c3[4], c4[4]; - float a, ang = DEG2RADF(360.0f) / (float)settings->angle; + float a, ang = DEG2RADF(360.0f) / (float)settings->streaks; int size = inputTile->getWidth() * inputTile->getHeight(); int size4 = size * 4; diff --git a/source/blender/depsgraph/intern/eval/deg_eval.cc b/source/blender/depsgraph/intern/eval/deg_eval.cc index 3a042535d26..e739bc9dbb5 100644 --- a/source/blender/depsgraph/intern/eval/deg_eval.cc +++ b/source/blender/depsgraph/intern/eval/deg_eval.cc @@ -95,105 +95,38 @@ static void deg_task_run_func(TaskPool *pool, /* Should only be the case for NOOPs, which never get to this point. */ BLI_assert(node->evaluate); - while (true) { - /* Get context. */ - /* TODO: Who initialises this? "Init" operations aren't able to - * initialise it!!! - */ - /* TODO(sergey): We don't use component contexts at this moment. */ - /* ComponentDepsNode *comp = node->owner; */ - BLI_assert(node->owner != NULL); - - /* Since we're not leaving the thread for until the graph branches it is - * possible to have NO-OP on the way. for which evaluate() will be NULL. - * but that's all fine, we'll just scheduler it's children. - */ - if (node->evaluate) { + /* Get context. */ + /* TODO: Who initialises this? "Init" operations aren't able to + * initialise it!!! 
+ */ + /* TODO(sergey): We don't use component contexts at this moment. */ + /* ComponentDepsNode *comp = node->owner; */ + BLI_assert(node->owner != NULL); + + /* Since we're not leaving the thread for until the graph branches it is + * possible to have NO-OP on the way. for which evaluate() will be NULL. + * but that's all fine, we'll just scheduler it's children. + */ + if (node->evaluate) { /* Take note of current time. */ #ifdef USE_DEBUGGER - double start_time = PIL_check_seconds_timer(); - DepsgraphDebug::task_started(state->graph, node); + double start_time = PIL_check_seconds_timer(); + DepsgraphDebug::task_started(state->graph, node); #endif - /* Perform operation. */ - node->evaluate(state->eval_ctx); + /* Perform operation. */ + node->evaluate(state->eval_ctx); /* Note how long this took. */ #ifdef USE_DEBUGGER - double end_time = PIL_check_seconds_timer(); - DepsgraphDebug::task_completed(state->graph, - node, - end_time - start_time); + double end_time = PIL_check_seconds_timer(); + DepsgraphDebug::task_completed(state->graph, + node, + end_time - start_time); #endif - } - - /* If there's only one outgoing link we try to immediately switch to - * that node evaluation, without leaving the thread. - * - * It's only doable if the child don't have extra relations or all they - * are satisfied. - * - * TODO(sergey): Checks here can be de-duplicated with the ones from - * schedule_node(), however, how to do it nicely? - */ - if (node->outlinks.size() == 1) { - DepsRelation *rel = node->outlinks[0]; - OperationDepsNode *child = (OperationDepsNode *)rel->to; - BLI_assert(child->type == DEPSNODE_TYPE_OPERATION); - if (!child->scheduled) { - unsigned int id_layers = child->owner->owner->layers; - if (!((child->flag & DEPSOP_FLAG_NEEDS_UPDATE) != 0 && - (id_layers & state->layers) != 0)) - { - /* Node does not need an update, so can;t continue with the - * chain and need to switch to another one by leaving the - * thread. 
- */ - break; - } - if ((rel->flag & DEPSREL_FLAG_CYCLIC) == 0) { - BLI_assert(child->num_links_pending > 0); - atomic_sub_and_fetch_uint32(&child->num_links_pending, 1); - } - if (child->num_links_pending == 0) { - bool is_scheduled = atomic_fetch_and_or_uint8( - (uint8_t *)&child->scheduled, (uint8_t)true); - if (!is_scheduled) { - /* Node was not scheduled, switch to it! */ - node = child; - } - else { - /* Someone else scheduled the node, leaving us - * unemployed in this thread, we're leaving. - */ - break; - } - } - else { - /* There are other dependencies on the child, can't do - * anything in the current thread. - */ - break; - } - } - else { - /* Happens when having cyclic dependencies. - * - * Nothing to do here, single child was already scheduled, we - * can leave the thread now. - */ - break; - } - } - else { - /* TODO(sergey): It's possible to use one of the outgoing relations - * as a chain which we'll try to keep alive, but it's a bit more - * involved change. - */ - schedule_children(pool, state->graph, node, state->layers, thread_id); - break; - } } + + schedule_children(pool, state->graph, node, state->layers, thread_id); } typedef struct CalculatePengindData { @@ -378,12 +311,19 @@ void deg_evaluate_on_refresh(EvaluationContext *eval_ctx, state.graph = graph; state.layers = layers; - TaskScheduler *task_scheduler = BLI_task_scheduler_get(); - TaskPool *task_pool = BLI_task_pool_create(task_scheduler, &state); + TaskScheduler *task_scheduler; + bool need_free_scheduler; if (G.debug & G_DEBUG_DEPSGRAPH_NO_THREADS) { - BLI_pool_set_num_threads(task_pool, 1); + task_scheduler = BLI_task_scheduler_create(1); + need_free_scheduler = true; } + else { + task_scheduler = BLI_task_scheduler_get(); + need_free_scheduler = false; + } + + TaskPool *task_pool = BLI_task_pool_create_suspended(task_scheduler, &state); calculate_pending_parents(graph, layers); @@ -410,6 +350,10 @@ void deg_evaluate_on_refresh(EvaluationContext *eval_ctx, /* Clear any uncleared 
tags - just in case. */ deg_graph_clear_tags(graph); + + if (need_free_scheduler) { + BLI_task_scheduler_free(task_scheduler); + } } } // namespace DEG diff --git a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc index 7c6c25bef0d..e10f86f6e95 100644 --- a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc +++ b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc @@ -180,6 +180,11 @@ void deg_graph_flush_updates(Main *bmain, Depsgraph *graph) comp_node->done = 1; /* Flush to nodes along links... */ + /* TODO(sergey): This is mainly giving speedup due ot less queue pushes, which + * reduces number of memory allocations. + * + * We should try solve the allocation issue instead of doing crazy things here. + */ if (node->outlinks.size() == 1) { OperationDepsNode *to_node = (OperationDepsNode *)node->outlinks[0]->to; if (to_node->scheduled == false) { diff --git a/source/blender/editors/animation/anim_channels_defines.c b/source/blender/editors/animation/anim_channels_defines.c index 57302c18a88..4d4f8c1298a 100644 --- a/source/blender/editors/animation/anim_channels_defines.c +++ b/source/blender/editors/animation/anim_channels_defines.c @@ -3856,7 +3856,8 @@ void ANIM_channel_draw(bAnimContext *ac, bAnimListElem *ale, float yminc, float if (ac->sl) { if ((ac->spacetype == SPACE_IPO) && (acf->has_setting(ac, ale, ACHANNEL_SETTING_VISIBLE) || - acf->has_setting(ac, ale, ACHANNEL_SETTING_ALWAYS_VISIBLE))) { + acf->has_setting(ac, ale, ACHANNEL_SETTING_ALWAYS_VISIBLE))) + { /* for F-Curves, draw color-preview of curve behind checkbox */ if (ELEM(ale->type, ANIMTYPE_FCURVE, ANIMTYPE_NLACURVE)) { FCurve *fcu = (FCurve *)ale->data; diff --git a/source/blender/editors/animation/anim_draw.c b/source/blender/editors/animation/anim_draw.c index a4ba95420c1..98900812bb2 100644 --- a/source/blender/editors/animation/anim_draw.c +++ b/source/blender/editors/animation/anim_draw.c @@ -117,7 +117,8 @@ void 
ANIM_draw_cfra(const bContext *C, View2D *v2d, short flag) /* Draw a light green line to indicate current frame */ UI_ThemeColor(TH_CFRAME); - const float x = (float)(scene->r.cfra * scene->r.framelen); + const float time = scene->r.cfra + scene->r.subframe; + const float x = (float)(time * scene->r.framelen); glLineWidth((flag & DRAWCFRA_WIDE) ? 3.0 : 2.0); diff --git a/source/blender/editors/animation/anim_ops.c b/source/blender/editors/animation/anim_ops.c index c0d6963acbb..bb73cbf03b4 100644 --- a/source/blender/editors/animation/anim_ops.c +++ b/source/blender/editors/animation/anim_ops.c @@ -95,7 +95,7 @@ static void change_frame_apply(bContext *C, wmOperator *op) { Main *bmain = CTX_data_main(C); Scene *scene = CTX_data_scene(C); - int frame = RNA_int_get(op->ptr, "frame"); + float frame = RNA_float_get(op->ptr, "frame"); bool do_snap = RNA_boolean_get(op->ptr, "snap"); if (do_snap && CTX_wm_space_seq(C)) { @@ -103,10 +103,15 @@ static void change_frame_apply(bContext *C, wmOperator *op) } /* set the new frame number */ - CFRA = frame; + CFRA = (int)frame; + if (scene->r.flag & SCER_SHOW_SUBFRAME) { + SUBFRA = frame - (int)frame; + } + else { + SUBFRA = 0.0f; + } FRAMENUMBER_MIN_CLAMP(CFRA); - SUBFRA = 0.0f; - + /* do updates */ BKE_sound_seek_scene(bmain, scene); WM_event_add_notifier(C, NC_SCENE | ND_FRAME, scene); @@ -125,18 +130,18 @@ static int change_frame_exec(bContext *C, wmOperator *op) /* ---- */ /* Get frame from mouse coordinates */ -static int frame_from_event(bContext *C, const wmEvent *event) +static float frame_from_event(bContext *C, const wmEvent *event) { ARegion *region = CTX_wm_region(C); Scene *scene = CTX_data_scene(C); float viewx; - int frame; + float frame; /* convert from region coordinates to View2D 'tot' space */ viewx = UI_view2d_region_to_view_x(®ion->v2d, event->mval[0]); /* round result to nearest int (frames are ints!) 
*/ - frame = iroundf(viewx); + frame = viewx; if (scene->r.flag & SCER_LOCK_FRAME_SELECTION) { CLAMP(frame, PSFRA, PEFRA); @@ -187,7 +192,7 @@ static int change_frame_invoke(bContext *C, wmOperator *op, const wmEvent *event * as user could click on a single frame (jump to frame) as well as * click-dragging over a range (modal scrubbing). */ - RNA_int_set(op->ptr, "frame", frame_from_event(C, event)); + RNA_float_set(op->ptr, "frame", frame_from_event(C, event)); change_frame_seq_preview_begin(C, event); @@ -215,7 +220,7 @@ static int change_frame_modal(bContext *C, wmOperator *op, const wmEvent *event) break; case MOUSEMOVE: - RNA_int_set(op->ptr, "frame", frame_from_event(C, event)); + RNA_float_set(op->ptr, "frame", frame_from_event(C, event)); change_frame_apply(C, op); break; @@ -268,7 +273,7 @@ static void ANIM_OT_change_frame(wmOperatorType *ot) ot->undo_group = "FRAME_CHANGE"; /* rna */ - ot->prop = RNA_def_int(ot->srna, "frame", 0, MINAFRAME, MAXFRAME, "Frame", "", MINAFRAME, MAXFRAME); + ot->prop = RNA_def_float(ot->srna, "frame", 0, MINAFRAME, MAXFRAME, "Frame", "", MINAFRAME, MAXFRAME); prop = RNA_def_boolean(ot->srna, "snap", false, "Snap", ""); RNA_def_property_flag(prop, PROP_SKIP_SAVE); } diff --git a/source/blender/editors/armature/armature_intern.h b/source/blender/editors/armature/armature_intern.h index b39b4bd81ee..190b0610059 100644 --- a/source/blender/editors/armature/armature_intern.h +++ b/source/blender/editors/armature/armature_intern.h @@ -247,8 +247,10 @@ void armature_select_mirrored_ex(struct bArmature *arm, const int flag); void armature_select_mirrored(struct bArmature *arm); void armature_tag_unselect(struct bArmature *arm); -void *get_nearest_bone(struct bContext *C, short findunsel, int x, int y); -void *get_bone_from_selectbuffer(struct Scene *scene, struct Base *base, unsigned int *buffer, short hits, short findunsel, bool do_nearest); +void *get_nearest_bone(struct bContext *C, const int xy[2], bool findunsel); +void 
*get_bone_from_selectbuffer( + struct Scene *scene, struct Base *base, const unsigned int *buffer, short hits, + bool findunsel, bool do_nearest); int bone_looper(struct Object *ob, struct Bone *bone, void *data, int (*bone_func)(struct Object *, struct Bone *, void *)); diff --git a/source/blender/editors/armature/armature_naming.c b/source/blender/editors/armature/armature_naming.c index fa192ed6f36..c928508237d 100644 --- a/source/blender/editors/armature/armature_naming.c +++ b/source/blender/editors/armature/armature_naming.c @@ -362,7 +362,7 @@ static int armature_flip_names_exec(bContext *C, wmOperator *UNUSED(op)) arm = ob->data; - ListBase bones_names= {NULL}; + ListBase bones_names = {NULL}; CTX_DATA_BEGIN(C, EditBone *, ebone, selected_editable_bones) { diff --git a/source/blender/editors/armature/armature_select.c b/source/blender/editors/armature/armature_select.c index e9946abba0b..d19862cb4b0 100644 --- a/source/blender/editors/armature/armature_select.c +++ b/source/blender/editors/armature/armature_select.c @@ -53,6 +53,8 @@ #include "ED_screen.h" #include "ED_view3d.h" +#include "GPU_select.h" + #include "armature_intern.h" /* utility macros for storing a temp int in the bone (selection flag) */ @@ -74,7 +76,9 @@ Bone *get_indexed_bone(Object *ob, int index) /* See if there are any selected bones in this buffer */ /* only bones from base are checked on */ -void *get_bone_from_selectbuffer(Scene *scene, Base *base, unsigned int *buffer, short hits, short findunsel, bool do_nearest) +void *get_bone_from_selectbuffer( + Scene *scene, Base *base, const unsigned int *buffer, short hits, + bool findunsel, bool do_nearest) { Object *obedit = scene->obedit; // XXX get from context Bone *bone; @@ -103,8 +107,8 @@ void *get_bone_from_selectbuffer(Scene *scene, Base *base, unsigned int *buffer, sel = (bone->flag & BONE_SELECTED); else sel = !(bone->flag & BONE_SELECTED); - - data = bone; + + data = bone; } else { data = NULL; @@ -162,7 +166,7 @@ void 
*get_bone_from_selectbuffer(Scene *scene, Base *base, unsigned int *buffer, /* used by posemode as well editmode */ /* only checks scene->basact! */ /* x and y are mouse coords (area space) */ -void *get_nearest_bone(bContext *C, short findunsel, int x, int y) +void *get_nearest_bone(bContext *C, const int xy[2], bool findunsel) { ViewContext vc; rcti rect; @@ -172,10 +176,10 @@ void *get_nearest_bone(bContext *C, short findunsel, int x, int y) view3d_set_viewcontext(C, &vc); // rect.xmin = ... mouseco! - rect.xmin = rect.xmax = x; - rect.ymin = rect.ymax = y; + rect.xmin = rect.xmax = xy[0]; + rect.ymin = rect.ymax = xy[1]; - hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true); + hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, VIEW3D_SELECT_PICK_NEAREST); if (hits > 0) return get_bone_from_selectbuffer(vc.scene, vc.scene->basact, buffer, hits, findunsel, true); @@ -197,10 +201,7 @@ static int armature_select_linked_invoke(bContext *C, wmOperator *op, const wmEv view3d_operator_needs_opengl(C); - if (extend) - bone = get_nearest_bone(C, 0, event->mval[0], event->mval[1]); - else - bone = get_nearest_bone(C, 1, event->mval[0], event->mval[1]); + bone = get_nearest_bone(C, event->mval, !extend); if (!bone) return OPERATOR_CANCELLED; @@ -276,10 +277,24 @@ void ARMATURE_OT_select_linked(wmOperatorType *ot) RNA_def_boolean(ot->srna, "extend", false, "Extend", "Extend selection instead of deselecting everything first"); } +/* utility function for get_nearest_editbonepoint */ +static int selectbuffer_ret_hits_12(unsigned int *UNUSED(buffer), const int hits12) +{ + return hits12; +} + +static int selectbuffer_ret_hits_5(unsigned int *buffer, const int hits12, const int hits5) +{ + const int offs = 4 * hits12; + memcpy(buffer, buffer + offs, 4 * hits5 * sizeof(unsigned int)); + return hits5; +} + /* does bones and points */ /* note that BONE ROOT only gets drawn for root bones (or without IK) */ -static EditBone 
*get_nearest_editbonepoint(ViewContext *vc, const int mval[2], - ListBase *edbo, int findunsel, int *selmask) +static EditBone *get_nearest_editbonepoint( + ViewContext *vc, const int mval[2], + ListBase *edbo, bool findunsel, bool use_cycle, int *r_selmask) { bArmature *arm = (bArmature *)vc->obedit->data; EditBone *ebone_next_act = arm->act_edbone; @@ -289,7 +304,9 @@ static EditBone *get_nearest_editbonepoint(ViewContext *vc, const int mval[2], unsigned int buffer[MAXPICKBUF]; unsigned int hitresult, besthitresult = BONESEL_NOSEL; int i, mindep = 5; - short hits; + int hits12, hits5 = 0; + + static int last_mval[2] = {-100, -100}; /* find the bone after the current active bone, so as to bump up its chances in selection. * this way overlapping bones will cycle selection state as with objects. */ @@ -303,22 +320,59 @@ static EditBone *get_nearest_editbonepoint(ViewContext *vc, const int mval[2], ebone_next_act = NULL; } - rect.xmin = mval[0] - 5; - rect.xmax = mval[0] + 5; - rect.ymin = mval[1] - 5; - rect.ymax = mval[1] + 5; - - hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, true); - if (hits == 0) { - rect.xmin = mval[0] - 12; - rect.xmax = mval[0] + 12; - rect.ymin = mval[1] - 12; - rect.ymax = mval[1] + 12; - hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, true); + bool do_nearest = false; + + /* define if we use solid nearest select or not */ + if (use_cycle) { + if (vc->v3d->drawtype > OB_WIRE) { + do_nearest = true; + if (len_manhattan_v2v2_int(mval, last_mval) < 3) { + do_nearest = false; + } + } + copy_v2_v2_int(last_mval, mval); + } + else { + if (vc->v3d->drawtype > OB_WIRE) { + do_nearest = true; + } + } + + /* matching logic from 'mixed_bones_object_selectbuffer' */ + const int select_mode = (do_nearest ? 
VIEW3D_SELECT_PICK_NEAREST : VIEW3D_SELECT_PICK_ALL); + int hits = 0; + + /* we _must_ end cache before return, use 'goto cache_end' */ + GPU_select_cache_begin(); + + BLI_rcti_init_pt_radius(&rect, mval, 12); + hits12 = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, select_mode); + if (hits12 == 1) { + hits = selectbuffer_ret_hits_12(buffer, hits12); + goto cache_end; + } + else if (hits12 > 0) { + int offs; + + offs = 4 * hits12; + BLI_rcti_init_pt_radius(&rect, mval, 5); + hits5 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, select_mode); + + if (hits5 == 1) { + hits = selectbuffer_ret_hits_5(buffer, hits12, hits5); + goto cache_end; + } + + if (hits5 > 0) { hits = selectbuffer_ret_hits_5(buffer, hits12, hits5); goto cache_end; } + else { hits = selectbuffer_ret_hits_12(buffer, hits12); goto cache_end; } } + +cache_end: + GPU_select_cache_end(); + /* See if there are any selected bones in this group */ if (hits > 0) { - + if (hits == 1) { if (!(buffer[3] & BONESEL_NOSEL)) besthitresult = buffer[3]; @@ -375,17 +429,17 @@ static EditBone *get_nearest_editbonepoint(ViewContext *vc, const int mval[2], ebone = BLI_findlink(edbo, besthitresult & ~BONESEL_ANY); - *selmask = 0; + *r_selmask = 0; if (besthitresult & BONESEL_ROOT) - *selmask |= BONE_ROOTSEL; + *r_selmask |= BONE_ROOTSEL; if (besthitresult & BONESEL_TIP) - *selmask |= BONE_TIPSEL; + *r_selmask |= BONE_TIPSEL; if (besthitresult & BONESEL_BONE) - *selmask |= BONE_SELECTED; + *r_selmask |= BONE_SELECTED; return ebone; } } - *selmask = 0; + *r_selmask = 0; return NULL; } @@ -439,8 +493,8 @@ bool ED_armature_select_pick(bContext *C, const int mval[2], bool extend, bool d if (BIF_sk_selectStroke(C, mval, extend)) { return true; } - - nearBone = get_nearest_editbonepoint(&vc, mval, arm->edbo, 1, &selmask); + + nearBone = get_nearest_editbonepoint(&vc, mval, arm->edbo, true, true, &selmask); if (nearBone) { if (!extend && !deselect && !toggle) { @@ -1202,7 +1256,7 @@ static int 
armature_shortest_path_pick_invoke(bContext *C, wmOperator *op, const view3d_operator_needs_opengl(C); ebone_src = arm->act_edbone; - ebone_dst = get_nearest_bone(C, 0, event->mval[0], event->mval[1]); + ebone_dst = get_nearest_bone(C, event->mval, false); /* fallback to object selection */ if (ELEM(NULL, ebone_src, ebone_dst) || (ebone_src == ebone_dst)) { diff --git a/source/blender/editors/armature/editarmature_sketch.c b/source/blender/editors/armature/editarmature_sketch.c index f6c04e9570a..bba486bc65c 100644 --- a/source/blender/editors/armature/editarmature_sketch.c +++ b/source/blender/editors/armature/editarmature_sketch.c @@ -1907,12 +1907,9 @@ static bool sk_selectStroke(bContext *C, SK_Sketch *sketch, const int mval[2], c view3d_set_viewcontext(C, &vc); - rect.xmin = mval[0] - 5; - rect.xmax = mval[0] + 5; - rect.ymin = mval[1] - 5; - rect.ymax = mval[1] + 5; + BLI_rcti_init_pt_radius(&rect, mval, 5); - hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true); + hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, VIEW3D_SELECT_PICK_NEAREST); if (hits > 0) { int besthitresult = -1; diff --git a/source/blender/editors/armature/pose_select.c b/source/blender/editors/armature/pose_select.c index 44470c1f827..6e328552411 100644 --- a/source/blender/editors/armature/pose_select.c +++ b/source/blender/editors/armature/pose_select.c @@ -132,8 +132,9 @@ void ED_pose_bone_select(Object *ob, bPoseChannel *pchan, bool select) /* called from editview.c, for mode-less pose selection */ /* assumes scene obact and basact is still on old situation */ -int ED_do_pose_selectbuffer(Scene *scene, Base *base, unsigned int *buffer, short hits, - bool extend, bool deselect, bool toggle, bool do_nearest) +bool ED_do_pose_selectbuffer( + Scene *scene, Base *base, const unsigned int *buffer, short hits, + bool extend, bool deselect, bool toggle, bool do_nearest) { Object *ob = base->object; Bone *nearBone; @@ -280,12 +281,9 @@ static int 
pose_select_connected_invoke(bContext *C, wmOperator *op, const wmEve const bool extend = RNA_boolean_get(op->ptr, "extend"); view3d_operator_needs_opengl(C); - - if (extend) - bone = get_nearest_bone(C, 0, event->mval[0], event->mval[1]); - else - bone = get_nearest_bone(C, 1, event->mval[0], event->mval[1]); - + + bone = get_nearest_bone(C, event->mval, !extend); + if (!bone) return OPERATOR_CANCELLED; diff --git a/source/blender/editors/curve/editcurve.c b/source/blender/editors/curve/editcurve.c index e9fd5fb5a43..47f42ab5321 100644 --- a/source/blender/editors/curve/editcurve.c +++ b/source/blender/editors/curve/editcurve.c @@ -91,14 +91,6 @@ typedef struct { int flag; } UndoCurve; -/* Definitions needed for shape keys */ -typedef struct { - void *orig_cv; - int key_index, nu_index, pt_index, vertex_index; - bool switched; - Nurb *orig_nu; -} CVKeyIndex; - void selectend_nurb(Object *obedit, enum eEndPoint_Types selfirst, bool doswap, bool selstatus); static void adduplicateflagNurb(Object *obedit, ListBase *newnurb, const short flag, const bool split); static int curve_delete_segments(Object *obedit, const bool split); @@ -138,9 +130,9 @@ void printknots(Object *obedit) /* ********************* Shape keys *************** */ -static CVKeyIndex *init_cvKeyIndex(void *cv, int key_index, int nu_index, int pt_index, int vertex_index, Nurb *orig_nu) +static CVKeyIndex *init_cvKeyIndex(void *cv, int key_index, int nu_index, int pt_index, int vertex_index) { - CVKeyIndex *cvIndex = MEM_callocN(sizeof(CVKeyIndex), "init_cvKeyIndex"); + CVKeyIndex *cvIndex = MEM_callocN(sizeof(CVKeyIndex), __func__); cvIndex->orig_cv = cv; cvIndex->key_index = key_index; @@ -148,7 +140,6 @@ static CVKeyIndex *init_cvKeyIndex(void *cv, int key_index, int nu_index, int pt cvIndex->pt_index = pt_index; cvIndex->vertex_index = vertex_index; cvIndex->switched = false; - cvIndex->orig_nu = orig_nu; return cvIndex; } @@ -174,7 +165,12 @@ static void init_editNurb_keyIndex(EditNurb *editnurb, 
ListBase *origBase) origbezt = orignu->bezt; pt_index = 0; while (a--) { - keyIndex = init_cvKeyIndex(origbezt, key_index, nu_index, pt_index, vertex_index, orignu); + /* We cannot keep *any* reference to curve obdata, + * it might be replaced and freed while editcurve remain in use (in viewport render case e.g.). + * Note that we could use a pool to avoid lots of malloc's here, but... not really a problem for now. */ + BezTriple *origbezt_cpy = MEM_mallocN(sizeof(*origbezt), __func__); + *origbezt_cpy = *origbezt; + keyIndex = init_cvKeyIndex(origbezt_cpy, key_index, nu_index, pt_index, vertex_index); BLI_ghash_insert(gh, bezt, keyIndex); key_index += 12; vertex_index += 3; @@ -189,7 +185,12 @@ static void init_editNurb_keyIndex(EditNurb *editnurb, ListBase *origBase) origbp = orignu->bp; pt_index = 0; while (a--) { - keyIndex = init_cvKeyIndex(origbp, key_index, nu_index, pt_index, vertex_index, orignu); + /* We cannot keep *any* reference to curve obdata, + * it might be replaced and freed while editcurve remain in use (in viewport render case e.g.). + * Note that we could use a pool to avoid lots of malloc's here, but... not really a problem for now. 
*/ + BPoint *origbp_cpy = MEM_mallocN(sizeof(*origbp_cpy), __func__); + *origbp_cpy = *origbp; + keyIndex = init_cvKeyIndex(origbp_cpy, key_index, nu_index, pt_index, vertex_index); BLI_ghash_insert(gh, bp, keyIndex); key_index += 4; bp++; @@ -250,23 +251,22 @@ static int getKeyIndexOrig_keyIndex(EditNurb *editnurb, void *cv) return index->key_index; } -static void keyIndex_delCV(EditNurb *editnurb, const void *cv) +static void keyIndex_delBezt(EditNurb *editnurb, BezTriple *bezt) { if (!editnurb->keyindex) { return; } - BLI_ghash_remove(editnurb->keyindex, cv, NULL, MEM_freeN); -} - -static void keyIndex_delBezt(EditNurb *editnurb, BezTriple *bezt) -{ - keyIndex_delCV(editnurb, bezt); + BKE_curve_editNurb_keyIndex_delCV(editnurb->keyindex, bezt); } static void keyIndex_delBP(EditNurb *editnurb, BPoint *bp) { - keyIndex_delCV(editnurb, bp); + if (!editnurb->keyindex) { + return; + } + + BKE_curve_editNurb_keyIndex_delCV(editnurb->keyindex, bp); } static void keyIndex_delNurb(EditNurb *editnurb, Nurb *nu) @@ -282,7 +282,7 @@ static void keyIndex_delNurb(EditNurb *editnurb, Nurb *nu) a = nu->pntsu; while (a--) { - BLI_ghash_remove(editnurb->keyindex, bezt, NULL, MEM_freeN); + BKE_curve_editNurb_keyIndex_delCV(editnurb->keyindex, bezt); bezt++; } } @@ -291,7 +291,7 @@ static void keyIndex_delNurb(EditNurb *editnurb, Nurb *nu) a = nu->pntsu * nu->pntsv; while (a--) { - BLI_ghash_remove(editnurb->keyindex, bp, NULL, MEM_freeN); + BKE_curve_editNurb_keyIndex_delCV(editnurb->keyindex, bp); bp++; } } @@ -535,6 +535,7 @@ static GHash *dupli_keyIndexHash(GHash *keyindex) CVKeyIndex *newIndex = MEM_mallocN(sizeof(CVKeyIndex), "dupli_keyIndexHash index"); memcpy(newIndex, index, sizeof(CVKeyIndex)); + newIndex->orig_cv = MEM_dupallocN(index->orig_cv); BLI_ghash_insert(gh, cv, newIndex); } @@ -624,7 +625,7 @@ static void calc_keyHandles(ListBase *nurb, float *key) } } -static void calc_shapeKeys(Object *obedit) +static void calc_shapeKeys(Object *obedit, ListBase *newnurbs) { 
Curve *cu = (Curve *)obedit->data; @@ -636,7 +637,7 @@ static void calc_shapeKeys(Object *obedit) KeyBlock *actkey = BLI_findlink(&cu->key->block, editnurb->shapenr - 1); BezTriple *bezt, *oldbezt; BPoint *bp, *oldbp; - Nurb *nu; + Nurb *nu, *newnu; int totvert = BKE_nurbList_verts_count(&editnurb->nurbs); float (*ofs)[3] = NULL; @@ -706,20 +707,25 @@ static void calc_shapeKeys(Object *obedit) currkey = cu->key->block.first; while (currkey) { - int apply_offset = (ofs && (currkey != actkey) && (editnurb->shapenr - 1 == currkey->relative)); + const bool apply_offset = (ofs && (currkey != actkey) && (editnurb->shapenr - 1 == currkey->relative)); float *fp = newkey = MEM_callocN(cu->key->elemsize * totvert, "currkey->data"); ofp = oldkey = currkey->data; nu = editnurb->nurbs.first; + /* We need to restore to original curve into newnurb, *not* editcurve's nurbs. + * Otherwise, in case we update obdata *without* leaving editmode (e.g. viewport render), we would + * invalidate editcurve. */ + newnu = newnurbs->first; i = 0; while (nu) { if (currkey == actkey) { - int restore = actkey != cu->key->refkey; + const bool restore = actkey != cu->key->refkey; if (nu->bezt) { bezt = nu->bezt; a = nu->pntsu; + BezTriple *newbezt = newnu->bezt; while (a--) { int j; oldbezt = getKeyIndexOrig_bezt(editnurb, bezt); @@ -728,7 +734,7 @@ static void calc_shapeKeys(Object *obedit) copy_v3_v3(fp, bezt->vec[j]); if (restore && oldbezt) { - copy_v3_v3(bezt->vec[j], oldbezt->vec[j]); + copy_v3_v3(newbezt->vec[j], oldbezt->vec[j]); } fp += 3; @@ -736,16 +742,18 @@ static void calc_shapeKeys(Object *obedit) fp[0] = bezt->alfa; if (restore && oldbezt) { - bezt->alfa = oldbezt->alfa; + newbezt->alfa = oldbezt->alfa; } fp += 3; ++i; /* alphas */ bezt++; + newbezt++; } } else { bp = nu->bp; a = nu->pntsu * nu->pntsv; + BPoint *newbp = newnu->bp; while (a--) { oldbp = getKeyIndexOrig_bp(editnurb, bp); @@ -754,12 +762,13 @@ static void calc_shapeKeys(Object *obedit) fp[3] = bp->alfa; if (restore && 
oldbp) { - copy_v3_v3(bp->vec, oldbp->vec); - bp->alfa = oldbp->alfa; + copy_v3_v3(newbp->vec, oldbp->vec); + newbp->alfa = oldbp->alfa; } fp += 4; bp++; + newbp++; i += 2; } } @@ -1204,9 +1213,13 @@ void ED_curve_editnurb_load(Object *obedit) } } + /* We have to pass also new copied nurbs, since we want to restore original curve (without edited shapekey) + * on obdata, but *not* on editcurve itself (ED_curve_editnurb_load call does not always implies freeing + * of editcurve, e.g. when called to generate render data...). */ + calc_shapeKeys(obedit, &newnurb); + cu->nurb = newnurb; - calc_shapeKeys(obedit); ED_curve_updateAnimPaths(obedit->data); BKE_nurbList_free(&oldnurb); @@ -1227,13 +1240,11 @@ void ED_curve_editnurb_make(Object *obedit) if (actkey) { // XXX strcpy(G.editModeTitleExtra, "(Key) "); undo_editmode_clear(); - BKE_keyblock_convert_to_curve(actkey, cu, &cu->nurb); } if (editnurb) { BKE_nurbList_free(&editnurb->nurbs); - BKE_curve_editNurb_keyIndex_free(editnurb); - editnurb->keyindex = NULL; + BKE_curve_editNurb_keyIndex_free(&editnurb->keyindex); } else { editnurb = MEM_callocN(sizeof(EditNurb), "editnurb"); @@ -1248,12 +1259,16 @@ void ED_curve_editnurb_make(Object *obedit) nu = nu->next; } - if (actkey) - editnurb->shapenr = obedit->shapenr; - /* animation could be added in editmode even if there was no animdata in * object mode hence we always need CVs index be created */ init_editNurb_keyIndex(editnurb, &cu->nurb); + + if (actkey) { + editnurb->shapenr = obedit->shapenr; + /* Apply shapekey to new nurbs of editnurb, not those of original curve (and *after* we generated keyIndex), + * else we do not have valid 'original' data to properly restore curve when leaving editmode. 
*/ + BKE_keyblock_convert_to_curve(actkey, cu, &editnurb->nurbs); + } } } @@ -1309,8 +1324,7 @@ static int separate_exec(bContext *C, wmOperator *op) ED_curve_editnurb_make(newob); newedit = newcu->editnurb; BKE_nurbList_free(&newedit->nurbs); - BKE_curve_editNurb_keyIndex_free(newedit); - newedit->keyindex = NULL; + BKE_curve_editNurb_keyIndex_free(&newedit->keyindex); BLI_movelisttolist(&newedit->nurbs, &newnurb); /* 4. put old object out of editmode and delete separated geometry */ @@ -6110,7 +6124,7 @@ static void undoCurve_to_editCurve(void *ucu, void *UNUSED(edata), void *cu_v) BKE_nurbList_free(editbase); if (undoCurve->undoIndex) { - BLI_ghash_free(editnurb->keyindex, NULL, MEM_freeN); + BKE_curve_editNurb_keyIndex_free(&editnurb->keyindex); editnurb->keyindex = dupli_keyIndexHash(undoCurve->undoIndex); } @@ -6188,8 +6202,7 @@ static void free_undoCurve(void *ucv) BKE_nurbList_free(&undoCurve->nubase); - if (undoCurve->undoIndex) - BLI_ghash_free(undoCurve->undoIndex, NULL, MEM_freeN); + BKE_curve_editNurb_keyIndex_free(&undoCurve->undoIndex); free_fcurves(&undoCurve->fcurves); free_fcurves(&undoCurve->drivers); diff --git a/source/blender/editors/gpencil/gpencil_edit.c b/source/blender/editors/gpencil/gpencil_edit.c index e118e490f25..fa9acc36a2b 100644 --- a/source/blender/editors/gpencil/gpencil_edit.c +++ b/source/blender/editors/gpencil/gpencil_edit.c @@ -74,7 +74,6 @@ #include "ED_object.h" #include "ED_screen.h" #include "ED_view3d.h" -#include "ED_screen.h" #include "ED_space_api.h" #include "gpencil_intern.h" diff --git a/source/blender/editors/include/ED_armature.h b/source/blender/editors/include/ED_armature.h index 7ad61671b1b..6b8943421bd 100644 --- a/source/blender/editors/include/ED_armature.h +++ b/source/blender/editors/include/ED_armature.h @@ -130,8 +130,9 @@ void ED_armature_ebone_listbase_temp_clear(struct ListBase *lb); void ED_armature_deselect_all(struct Object *obedit); void ED_armature_deselect_all_visible(struct Object *obedit); 
-int ED_do_pose_selectbuffer(struct Scene *scene, struct Base *base, unsigned int *buffer, - short hits, bool extend, bool deselect, bool toggle, bool do_nearest); +bool ED_do_pose_selectbuffer( + struct Scene *scene, struct Base *base, const unsigned int *buffer, short hits, + bool extend, bool deselect, bool toggle, bool do_nearest); bool ED_armature_select_pick(struct bContext *C, const int mval[2], bool extend, bool deselect, bool toggle); int join_armature_exec(struct bContext *C, struct wmOperator *op); struct Bone *get_indexed_bone(struct Object *ob, int index); diff --git a/source/blender/editors/include/ED_view3d.h b/source/blender/editors/include/ED_view3d.h index 79176d9e9cf..af6f37d937c 100644 --- a/source/blender/editors/include/ED_view3d.h +++ b/source/blender/editors/include/ED_view3d.h @@ -47,6 +47,7 @@ struct Main; struct MetaElem; struct Nurb; struct Object; +struct RV3DMatrixStore; struct RegionView3D; struct Scene; struct ScrArea; @@ -301,7 +302,19 @@ bool ED_view3d_autodist_depth_seg(struct ARegion *ar, const int mval_sta[2], con /* select */ #define MAXPICKELEMS 2500 #define MAXPICKBUF (4 * MAXPICKELEMS) -short view3d_opengl_select(struct ViewContext *vc, unsigned int *buffer, unsigned int bufsize, const struct rcti *input, bool do_nearest); + +enum { + /* all elements in the region, ignore depth */ + VIEW3D_SELECT_ALL = 0, + /* pick also depth sorts (only for small regions!) */ + VIEW3D_SELECT_PICK_ALL = 1, + /* sorts and only returns visible objects (only for small regions!) 
*/ + VIEW3D_SELECT_PICK_NEAREST = 2, +}; + +int view3d_opengl_select( + struct ViewContext *vc, unsigned int *buffer, unsigned int bufsize, const struct rcti *input, + int select_mode); /* view3d_select.c */ float ED_view3d_select_dist_px(void); @@ -330,8 +343,8 @@ void ED_view3d_check_mats_rv3d(struct RegionView3D *rv3d); #endif int ED_view3d_scene_layer_set(int lay, const int *values, int *active); -void *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d); -void ED_view3d_mats_rv3d_restore(struct RegionView3D *rv3d, void *rv3dmat_pt); +struct RV3DMatrixStore *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d); +void ED_view3d_mats_rv3d_restore(struct RegionView3D *rv3d, struct RV3DMatrixStore *rv3dmat); bool ED_view3d_context_activate(struct bContext *C); void ED_view3d_draw_offscreen_init(struct Scene *scene, struct View3D *v3d); diff --git a/source/blender/editors/interface/interface_handlers.c b/source/blender/editors/interface/interface_handlers.c index 734cd02a056..6e3c3c3674a 100644 --- a/source/blender/editors/interface/interface_handlers.c +++ b/source/blender/editors/interface/interface_handlers.c @@ -7741,7 +7741,8 @@ static void button_activate_state(bContext *C, uiBut *but, uiHandleButtonState s if (ui_but_is_cursor_warp(but)) { #ifdef USE_CONT_MOUSE_CORRECT - if (data->ungrab_mval[0] != FLT_MAX) { + /* stereo3d has issues with changing cursor location so rather avoid */ + if (data->ungrab_mval[0] != FLT_MAX && !WM_stereo3d_enabled(data->window, false)) { int mouse_ungrab_xy[2]; ui_block_to_window_fl(data->region, but->block, &data->ungrab_mval[0], &data->ungrab_mval[1]); mouse_ungrab_xy[0] = data->ungrab_mval[0]; diff --git a/source/blender/editors/interface/interface_layout.c b/source/blender/editors/interface/interface_layout.c index ca2538022b0..ce1153911da 100644 --- a/source/blender/editors/interface/interface_layout.c +++ b/source/blender/editors/interface/interface_layout.c @@ -189,7 +189,7 @@ static const char 
*ui_item_name_add_colon(const char *name, char namestr[UI_MAX_ return name; } -static int ui_item_fit(int item, int pos, int all, int available, bool is_last, int alignment) +static int ui_item_fit(int item, int pos, int all, int available, bool is_last, int alignment, float *extra_pixel) { /* available == 0 is unlimited */ if (available == 0) @@ -199,16 +199,22 @@ static int ui_item_fit(int item, int pos, int all, int available, bool is_last, /* contents is bigger than available space */ if (is_last) return available - pos; - else - return (item * available) / all; + else { + float width = *extra_pixel + (item * available) / (float)all; + *extra_pixel = width - (int)width; + return (int)width; + } } else { /* contents is smaller or equal to available space */ if (alignment == UI_LAYOUT_ALIGN_EXPAND) { if (is_last) return available - pos; - else - return (item * available) / all; + else { + float width = *extra_pixel + (item * available) / (float)all; + *extra_pixel = width - (int)width; + return (int)width; + } } else return item; @@ -302,6 +308,26 @@ static void ui_item_position(uiItem *item, int x, int y, int w, int h) } } +static void ui_item_move(uiItem *item, int delta_xmin, int delta_xmax) +{ + if (item->type == ITEM_BUTTON) { + uiButtonItem *bitem = (uiButtonItem *)item; + + bitem->but->rect.xmin += delta_xmin; + bitem->but->rect.xmax += delta_xmax; + + ui_but_update(bitem->but); /* for strlen */ + } + else { + uiLayout *litem = (uiLayout *)item; + + if (delta_xmin > 0) + litem->x += delta_xmin; + else + litem->w += delta_xmax; + } +} + /******************** Special RNA Items *********************/ static int ui_layout_local_dir(uiLayout *layout) @@ -1248,7 +1274,7 @@ static void ui_item_rna_size( if (!w) { if (type == PROP_ENUM && icon_only) { w = ui_text_icon_width(layout, "", ICON_BLANK1, 0); - w += 0.6f * UI_UNIT_X; + w += 0.5f * UI_UNIT_X; } else { w = ui_text_icon_width(layout, name, icon, 0); @@ -2099,9 +2125,10 @@ static int ui_litem_min_width(int 
itemw) static void ui_litem_layout_row(uiLayout *litem) { - uiItem *item; + uiItem *item, *last_free_item = NULL; int x, y, w, tot, totw, neww, newtotw, itemw, minw, itemh, offset; int fixedw, freew, fixedx, freex, flag = 0, lastw = 0; + float extra_pixel; /* x = litem->x; */ /* UNUSED */ y = litem->y; @@ -2128,6 +2155,7 @@ static void ui_litem_layout_row(uiLayout *litem) x = 0; flag = 0; newtotw = totw; + extra_pixel = 0.0f; for (item = litem->items.first; item; item = item->next) { if (item->flag & UI_ITEM_FIXED) @@ -2137,7 +2165,7 @@ static void ui_litem_layout_row(uiLayout *litem) minw = ui_litem_min_width(itemw); if (w - lastw > 0) - neww = ui_item_fit(itemw, x, totw, w - lastw, !item->next, litem->alignment); + neww = ui_item_fit(itemw, x, totw, w - lastw, !item->next, litem->alignment, &extra_pixel); else neww = 0; /* no space left, all will need clamping to minimum size */ @@ -2166,6 +2194,7 @@ static void ui_litem_layout_row(uiLayout *litem) freex = 0; fixedx = 0; + extra_pixel = 0.0f; x = litem->x; for (item = litem->items.first; item; item = item->next) { @@ -2177,13 +2206,14 @@ static void ui_litem_layout_row(uiLayout *litem) if (item->type != ITEM_BUTTON && item->flag & UI_ITEM_MIN) { minw = itemw; } - itemw = ui_item_fit(minw, fixedx, fixedw, min_ii(w, fixedw), !item->next, litem->alignment); + itemw = ui_item_fit(minw, fixedx, fixedw, min_ii(w, fixedw), !item->next, litem->alignment, &extra_pixel); fixedx += itemw; } else { /* free size item */ - itemw = ui_item_fit(itemw, freex, freew, w - fixedw, !item->next, litem->alignment); + itemw = ui_item_fit(itemw, freex, freew, w - fixedw, !item->next, litem->alignment, &extra_pixel); freex += itemw; + last_free_item = item; } /* align right/center */ @@ -2205,6 +2235,16 @@ static void ui_litem_layout_row(uiLayout *litem) x += litem->space; } + /* add extra pixel */ + uiItem *last_item = litem->items.last; + extra_pixel = litem->w - (x - litem->x); + if (extra_pixel > 0 && litem->alignment == 
UI_LAYOUT_ALIGN_EXPAND && + last_free_item && last_item && last_item->flag & UI_ITEM_FIXED) { + ui_item_move(last_free_item, 0, extra_pixel); + for (item = last_free_item->next; item; item = item->next) + ui_item_move(item, extra_pixel, extra_pixel); + } + litem->w = x - litem->x; litem->h = litem->y - y; litem->x = x; @@ -2216,7 +2256,6 @@ static void ui_litem_estimate_column(uiLayout *litem) { uiItem *item; int itemw, itemh; - bool min_size_flag = true; litem->w = 0; litem->h = 0; @@ -2224,18 +2263,12 @@ static void ui_litem_estimate_column(uiLayout *litem) for (item = litem->items.first; item; item = item->next) { ui_item_size(item, &itemw, &itemh); - min_size_flag = min_size_flag && (item->flag & UI_ITEM_MIN); - litem->w = MAX2(litem->w, itemw); litem->h += itemh; if (item->next) litem->h += litem->space; } - - if (min_size_flag) { - litem->item.flag |= UI_ITEM_MIN; - } } static void ui_litem_layout_column(uiLayout *litem) @@ -2648,13 +2681,14 @@ static void ui_litem_layout_absolute(uiLayout *litem) static void ui_litem_estimate_split(uiLayout *litem) { ui_litem_estimate_row(litem); + litem->item.flag &= ~UI_ITEM_MIN; } static void ui_litem_layout_split(uiLayout *litem) { uiLayoutItemSplit *split = (uiLayoutItemSplit *)litem; uiItem *item; - float percentage; + float percentage, extra_pixel = 0.0f; const int tot = BLI_listbase_count(&litem->items); int itemh, x, y, w, colw = 0; @@ -2677,7 +2711,9 @@ static void ui_litem_layout_split(uiLayout *litem) x += colw; if (item->next) { - colw = (w - (int)(w * percentage)) / (tot - 1); + const float width = extra_pixel + (w - (int)(w * percentage)) / ((float)tot - 1); + extra_pixel = width - (int)width; + colw = (int)width; colw = MAX2(colw, 0); x += litem->space; @@ -3134,8 +3170,6 @@ static void ui_item_align(uiLayout *litem, short nr) else if (item->type == ITEM_LAYOUT_BOX) { box = (uiLayoutItemBx *)item; box->roundbox->alignnr = nr; - BLI_remlink(&litem->root->block->buttons, box->roundbox); - 
BLI_addhead(&litem->root->block->buttons, box->roundbox); } else if (((uiLayout *)item)->align) { ui_item_align((uiLayout *)item, nr); diff --git a/source/blender/editors/interface/interface_templates.c b/source/blender/editors/interface/interface_templates.c index 4db1c845c23..62f12cd7967 100644 --- a/source/blender/editors/interface/interface_templates.c +++ b/source/blender/editors/interface/interface_templates.c @@ -430,7 +430,7 @@ static void template_ID( uiLayoutRow(layout, true); } else if (flag & UI_ID_BROWSE) { - but = uiDefBlockButN(block, id_search_menu, MEM_dupallocN(template), "", 0, 0, UI_UNIT_X * 1.6, UI_UNIT_Y, + but = uiDefBlockButN(block, id_search_menu, MEM_dupallocN(template), "", 0, 0, UI_UNIT_X * 1.5, UI_UNIT_Y, TIP_(template_id_browse_tip(type))); ui_def_but_icon(but, RNA_struct_ui_icon(type), UI_HAS_ICON); /* default dragging of icon for id browse buttons */ @@ -1978,6 +1978,7 @@ static void curvemap_tools_dofunc(bContext *C, void *cumap_v, int event) case UICURVE_FUNC_HANDLE_AUTO_ANIM: /* set auto-clamped */ curvemap_handle_set(cuma, HD_AUTO_ANIM); curvemapping_changed(cumap, false); + break; case UICURVE_FUNC_EXTEND_HOZ: /* extend horiz */ cuma->flag &= ~CUMA_EXTEND_EXTRAPOLATE; curvemapping_changed(cumap, false); diff --git a/source/blender/editors/interface/interface_widgets.c b/source/blender/editors/interface/interface_widgets.c index b3736a71e74..6e871b8ec92 100644 --- a/source/blender/editors/interface/interface_widgets.c +++ b/source/blender/editors/interface/interface_widgets.c @@ -873,15 +873,15 @@ static void widget_draw_icon( if (icon && icon != ICON_BLANK1) { float ofs = 1.0f / aspect; - if (but->drawflag & UI_BUT_ICON_LEFT) { + if (but->drawflag & UI_BUT_ICON_LEFT || ui_block_is_pie_menu(but->block)) { if (but->block->flag & UI_BLOCK_LOOP) { if (but->type == UI_BTYPE_SEARCH_MENU) xs = rect->xmin + 4.0f * ofs; else - xs = rect->xmin + ofs; + xs = rect->xmin + 2.0f * ofs; } else { - xs = rect->xmin + 4.0f * ofs; + xs = 
rect->xmin + 2.0f * ofs; } ys = (rect->ymin + rect->ymax - height) / 2.0f; } @@ -1554,11 +1554,11 @@ static void widget_draw_text_icon(uiFontStyle *fstyle, uiWidgetColors *wcol, uiB /* Icons on the left with optional text label on the right */ else if (but->flag & UI_HAS_ICON || show_menu_icon) { const BIFIconID icon = (but->flag & UI_HAS_ICON) ? but->icon + but->iconadd : ICON_NONE; - const float icon_size = ICON_SIZE_FROM_BUTRECT(rect); + const float icon_size = ICON_DEFAULT_WIDTH; /* menu item - add some more padding so menus don't feel cramped. it must * be part of the button so that this area is still clickable */ - if (ui_block_is_menu(but->block)) + if (ui_block_is_menu(but->block) && !ui_block_is_pie_menu(but->block)) rect->xmin += 0.3f * U.widget_unit; widget_draw_icon(but, icon, alpha, rect, show_menu_icon); diff --git a/source/blender/editors/mesh/editmesh_intersect.c b/source/blender/editors/mesh/editmesh_intersect.c index de93211bec4..bc9088401db 100644 --- a/source/blender/editors/mesh/editmesh_intersect.c +++ b/source/blender/editors/mesh/editmesh_intersect.c @@ -137,6 +137,12 @@ enum { ISECT_SEL_UNSEL = 1, }; +enum { + ISECT_SEPARATE_ALL = 0, + ISECT_SEPARATE_CUT = 1, + ISECT_SEPARATE_NONE = 2, +}; + static int edbm_intersect_exec(bContext *C, wmOperator *op) { Object *obedit = CTX_data_edit_object(C); @@ -144,7 +150,9 @@ static int edbm_intersect_exec(bContext *C, wmOperator *op) BMesh *bm = em->bm; const int mode = RNA_enum_get(op->ptr, "mode"); int (*test_fn)(BMFace *, void *); - bool use_separate = RNA_boolean_get(op->ptr, "use_separate"); + bool use_separate_all = false; + bool use_separate_cut = false; + const int separate_mode = RNA_enum_get(op->ptr, "separate_mode"); const float eps = RNA_float_get(op->ptr, "threshold"); bool use_self; bool has_isect; @@ -160,15 +168,42 @@ static int edbm_intersect_exec(bContext *C, wmOperator *op) break; } + switch (separate_mode) { + case ISECT_SEPARATE_ALL: + use_separate_all = true; + break; + case 
ISECT_SEPARATE_CUT: + if (use_self == false) { + use_separate_cut = true; + } + else { + /* we could support this but would require more advanced logic inside 'BM_mesh_intersect' + * for now just separate all */ + use_separate_all = true; + } + break; + default: /* ISECT_SEPARATE_NONE */ + break; + } has_isect = BM_mesh_intersect( bm, em->looptris, em->tottri, test_fn, NULL, - use_self, use_separate, true, true, + use_self, use_separate_all, true, true, true, -1, eps); + if (use_separate_cut) { + /* detach selected/un-selected faces */ + BMOperator bmop; + EDBM_op_init(em, &bmop, op, "split geom=%hf use_only_faces=%b", BM_ELEM_SELECT, true); + BMO_op_exec(em->bm, &bmop); + if (!EDBM_op_finish(em, &bmop, op, true)) { + /* should never happen! */ + BKE_report(op->reports, RPT_ERROR, "Error separating"); + } + } if (has_isect) { edbm_intersect_select(em); @@ -190,6 +225,16 @@ void MESH_OT_intersect(struct wmOperatorType *ot) {0, NULL, 0, NULL, NULL} }; + static EnumPropertyItem isect_separate_items[] = { + {ISECT_SEPARATE_ALL, "ALL", 0, "All", + "Separate all geometry from intersections"}, + {ISECT_SEPARATE_CUT, "CUT", 0, "Cut", + "Cut into geometry keeping each side separate (Selected/Unselected only)"}, + {ISECT_SEPARATE_NONE, "NONE", 0, "Merge", + "Merge all geometry from the intersection"}, + {0, NULL, 0, NULL, NULL} + }; + /* identifiers */ ot->name = "Intersect (Knife)"; ot->description = "Cut an intersection into faces"; @@ -201,7 +246,7 @@ void MESH_OT_intersect(struct wmOperatorType *ot) /* props */ RNA_def_enum(ot->srna, "mode", isect_mode_items, ISECT_SEL_UNSEL, "Source", ""); - RNA_def_boolean(ot->srna, "use_separate", true, "Separate", ""); + RNA_def_enum(ot->srna, "separate_mode", isect_separate_items, ISECT_SEPARATE_CUT, "Separate Mode", ""); RNA_def_float_distance(ot->srna, "threshold", 0.000001f, 0.0, 0.01, "Merge threshold", "", 0.0, 0.001); /* flags */ @@ -239,7 +284,7 @@ static int edbm_intersect_boolean_exec(bContext *C, wmOperator *op) bm, 
em->looptris, em->tottri, test_fn, NULL, - false, false, true, true, + false, false, true, true, true, boolean_operation, eps); diff --git a/source/blender/editors/metaball/mball_edit.c b/source/blender/editors/metaball/mball_edit.c index ed5bf4a92b4..bc42717b69f 100644 --- a/source/blender/editors/metaball/mball_edit.c +++ b/source/blender/editors/metaball/mball_edit.c @@ -592,12 +592,9 @@ bool ED_mball_select_pick(bContext *C, const int mval[2], bool extend, bool dese view3d_set_viewcontext(C, &vc); - rect.xmin = mval[0] - 12; - rect.xmax = mval[0] + 12; - rect.ymin = mval[1] - 12; - rect.ymax = mval[1] + 12; + BLI_rcti_init_pt_radius(&rect, mval, 12); - hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true); + hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, VIEW3D_SELECT_PICK_NEAREST); /* does startelem exist? */ ml = mb->editelems->first; diff --git a/source/blender/editors/object/object_add.c b/source/blender/editors/object/object_add.c index 02b2d8492b4..ae458c722f9 100644 --- a/source/blender/editors/object/object_add.c +++ b/source/blender/editors/object/object_add.c @@ -64,6 +64,7 @@ #include "BKE_armature.h" #include "BKE_camera.h" #include "BKE_context.h" +#include "BKE_constraint.h" #include "BKE_curve.h" #include "BKE_depsgraph.h" #include "BKE_DerivedMesh.h" @@ -1377,7 +1378,7 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base, ob->proxy = NULL; ob->parent = NULL; - BLI_listbase_clear(&ob->constraints); + BKE_constraints_free(&ob->constraints); ob->curve_cache = NULL; ob->transflag &= ~OB_DUPLI; ob->lay = base->lay; diff --git a/source/blender/editors/object/object_bake_api.c b/source/blender/editors/object/object_bake_api.c index fd95d6129ad..968081818a2 100644 --- a/source/blender/editors/object/object_bake_api.c +++ b/source/blender/editors/object/object_bake_api.c @@ -352,12 +352,17 @@ static bool is_noncolor_pass(ScenePassType pass_type) } /* if all is good tag image and return true */ -static 
bool bake_object_check(Object *ob, ReportList *reports) +static bool bake_object_check(Scene *scene, Object *ob, ReportList *reports) { Image *image; void *lock; int i; + if ((ob->lay & scene->lay) == 0) { + BKE_reportf(reports, RPT_ERROR, "Object \"%s\" is not on a scene layer", ob->id.name + 2); + return false; + } + if (ob->type != OB_MESH) { BKE_reportf(reports, RPT_ERROR, "Object \"%s\" is not a mesh", ob->id.name + 2); return false; @@ -491,7 +496,7 @@ static bool bake_pass_filter_check(ScenePassType pass_type, const int pass_filte } /* before even getting in the bake function we check for some basic errors */ -static bool bake_objects_check(Main *bmain, Object *ob, ListBase *selected_objects, +static bool bake_objects_check(Main *bmain, Scene *scene, Object *ob, ListBase *selected_objects, ReportList *reports, const bool is_selected_to_active) { CollectionPointerLink *link; @@ -502,7 +507,7 @@ static bool bake_objects_check(Main *bmain, Object *ob, ListBase *selected_objec if (is_selected_to_active) { int tot_objects = 0; - if (!bake_object_check(ob, reports)) + if (!bake_object_check(scene, ob, reports)) return false; for (link = selected_objects->first; link; link = link->next) { @@ -530,7 +535,7 @@ static bool bake_objects_check(Main *bmain, Object *ob, ListBase *selected_objec } for (link = selected_objects->first; link; link = link->next) { - if (!bake_object_check(link->ptr.data, reports)) + if (!bake_object_check(scene, link->ptr.data, reports)) return false; } } @@ -619,7 +624,7 @@ static Mesh *bake_mesh_new_from_object(Main *bmain, Scene *scene, Object *ob) ED_object_editmode_load(ob); Mesh *me = BKE_mesh_new_from_object(bmain, scene, ob, 1, 2, 0, 0); - BKE_mesh_split_faces(me); + BKE_mesh_split_faces(me, true); return me; } @@ -1179,7 +1184,7 @@ static int bake_exec(bContext *C, wmOperator *op) goto finally; } - if (!bake_objects_check(bkr.main, bkr.ob, &bkr.selected_objects, bkr.reports, bkr.is_selected_to_active)) { + if 
(!bake_objects_check(bkr.main, bkr.scene, bkr.ob, &bkr.selected_objects, bkr.reports, bkr.is_selected_to_active)) { goto finally; } @@ -1237,7 +1242,7 @@ static void bake_startjob(void *bkv, short *UNUSED(stop), short *do_update, floa return; } - if (!bake_objects_check(bkr->main, bkr->ob, &bkr->selected_objects, bkr->reports, bkr->is_selected_to_active)) { + if (!bake_objects_check(bkr->main, bkr->scene, bkr->ob, &bkr->selected_objects, bkr->reports, bkr->is_selected_to_active)) { bkr->result = OPERATOR_CANCELLED; return; } diff --git a/source/blender/editors/object/object_intern.h b/source/blender/editors/object/object_intern.h index 9710e4f843d..b8957bdedf9 100644 --- a/source/blender/editors/object/object_intern.h +++ b/source/blender/editors/object/object_intern.h @@ -186,6 +186,7 @@ void OBJECT_OT_skin_loose_mark_clear(struct wmOperatorType *ot); void OBJECT_OT_skin_radii_equalize(struct wmOperatorType *ot); void OBJECT_OT_skin_armature_create(struct wmOperatorType *ot); void OBJECT_OT_laplaciandeform_bind(struct wmOperatorType *ot); +void OBJECT_OT_surfacedeform_bind(struct wmOperatorType *ot); /* object_constraint.c */ void OBJECT_OT_constraint_add(struct wmOperatorType *ot); diff --git a/source/blender/editors/object/object_modifier.c b/source/blender/editors/object/object_modifier.c index 06f495fb9f1..d601f5c3b14 100644 --- a/source/blender/editors/object/object_modifier.c +++ b/source/blender/editors/object/object_modifier.c @@ -2294,3 +2294,56 @@ void OBJECT_OT_laplaciandeform_bind(wmOperatorType *ot) ot->flag = OPTYPE_REGISTER | OPTYPE_UNDO | OPTYPE_INTERNAL; edit_modifier_properties(ot); } + +/************************ sdef bind operator *********************/ + +static int surfacedeform_bind_poll(bContext *C) +{ + return edit_modifier_poll_generic(C, &RNA_SurfaceDeformModifier, 0); +} + +static int surfacedeform_bind_exec(bContext *C, wmOperator *op) +{ + Object *ob = ED_object_active_context(C); + SurfaceDeformModifierData *smd = 
(SurfaceDeformModifierData *)edit_modifier_property_get(op, ob, eModifierType_SurfaceDeform); + + if (!smd) + return OPERATOR_CANCELLED; + + if (smd->flags & MOD_SDEF_BIND) { + smd->flags &= ~MOD_SDEF_BIND; + } + else if (smd->target) { + smd->flags |= MOD_SDEF_BIND; + } + + DAG_id_tag_update(&ob->id, OB_RECALC_DATA); + WM_event_add_notifier(C, NC_OBJECT | ND_MODIFIER, ob); + + return OPERATOR_FINISHED; +} + +static int surfacedeform_bind_invoke(bContext *C, wmOperator *op, const wmEvent *UNUSED(event)) +{ + if (edit_modifier_invoke_properties(C, op)) + return surfacedeform_bind_exec(C, op); + else + return OPERATOR_CANCELLED; +} + +void OBJECT_OT_surfacedeform_bind(wmOperatorType *ot) +{ + /* identifiers */ + ot->name = "Surface Deform Bind"; + ot->description = "Bind mesh to target in surface deform modifier"; + ot->idname = "OBJECT_OT_surfacedeform_bind"; + + /* api callbacks */ + ot->poll = surfacedeform_bind_poll; + ot->invoke = surfacedeform_bind_invoke; + ot->exec = surfacedeform_bind_exec; + + /* flags */ + ot->flag = OPTYPE_REGISTER | OPTYPE_UNDO | OPTYPE_INTERNAL; + edit_modifier_properties(ot); +} diff --git a/source/blender/editors/object/object_ops.c b/source/blender/editors/object/object_ops.c index 7e7e1ef182c..5fe5a884354 100644 --- a/source/blender/editors/object/object_ops.c +++ b/source/blender/editors/object/object_ops.c @@ -255,6 +255,7 @@ void ED_operatortypes_object(void) WM_operatortype_append(OBJECT_OT_data_transfer); WM_operatortype_append(OBJECT_OT_datalayout_transfer); + WM_operatortype_append(OBJECT_OT_surfacedeform_bind); } void ED_operatormacros_object(void) diff --git a/source/blender/editors/physics/physics_ops.c b/source/blender/editors/physics/physics_ops.c index 0c907f19753..b1d708ebc07 100644 --- a/source/blender/editors/physics/physics_ops.c +++ b/source/blender/editors/physics/physics_ops.c @@ -138,13 +138,21 @@ static void keymap_particle(wmKeyConfig *keyconf) RNA_boolean_set(kmi->ptr, "unselected", true); /* Shift+LMB 
behavior first, so it has priority over KM_ANY item below. */ - kmi = WM_keymap_verify_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0); + kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0); RNA_boolean_set(kmi->ptr, "release_confirm", true); RNA_boolean_set(kmi->ptr, "use_planar_constraint", true); + RNA_boolean_set(kmi->ptr, "use_accurate", false); + + kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0); + RNA_boolean_set(kmi->ptr, "release_confirm", true); + RNA_boolean_set(kmi->ptr, "use_planar_constraint", false); + RNA_boolean_set(kmi->ptr, "use_accurate", true); + /* Using KM_ANY here to allow holding modifiers before starting to transform. */ kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_ANY, 0); RNA_boolean_set(kmi->ptr, "release_confirm", true); RNA_boolean_set(kmi->ptr, "use_planar_constraint", false); + RNA_boolean_set(kmi->ptr, "use_accurate", false); WM_keymap_add_item(keymap, "PARTICLE_OT_brush_edit", LEFTMOUSE, KM_PRESS, 0, 0); WM_keymap_add_item(keymap, "PARTICLE_OT_brush_edit", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0); diff --git a/source/blender/editors/render/render_opengl.c b/source/blender/editors/render/render_opengl.c index 9097432a251..1d0f433ba38 100644 --- a/source/blender/editors/render/render_opengl.c +++ b/source/blender/editors/render/render_opengl.c @@ -315,7 +315,7 @@ static void screen_opengl_render_doit(OGLRender *oglrender, RenderResult *rr) RE_render_result_rect_from_ibuf(rr, &scene->r, out, oglrender->view_id); IMB_freeImBuf(out); } - else if (gpd){ + else if (gpd) { /* If there are no strips, Grease Pencil still needs a buffer to draw on */ ImBuf *out = IMB_allocImBuf(oglrender->sizex, oglrender->sizey, 32, IB_rect); RE_render_result_rect_from_ibuf(rr, &scene->r, out, oglrender->view_id); @@ -715,7 +715,6 @@ static bool screen_opengl_render_init(bContext *C, wmOperator *op) 
oglrender->task_scheduler = task_scheduler; oglrender->task_pool = BLI_task_pool_create_background(task_scheduler, oglrender); - BLI_pool_set_num_threads(oglrender->task_pool, 1); } else { oglrender->task_scheduler = NULL; @@ -747,6 +746,23 @@ static void screen_opengl_render_end(bContext *C, OGLRender *oglrender) int i; if (oglrender->is_animation) { + /* Trickery part for movie output: + * + * We MUST write frames in an exact order, so we only let background + * thread to work on that, and main thread is simply waits for that + * thread to do all the dirty work. + * + * After this loop is done work_and_wait() will have nothing to do, + * so we don't run into wrong order of frames written to the stream. + */ + if (BKE_imtype_is_movie(scene->r.im_format.imtype)) { + BLI_mutex_lock(&oglrender->task_mutex); + while (oglrender->num_scheduled_frames > 0) { + BLI_condition_wait(&oglrender->task_condition, + &oglrender->task_mutex); + } + BLI_mutex_unlock(&oglrender->task_mutex); + } BLI_task_pool_work_and_wait(oglrender->task_pool); BLI_task_pool_free(oglrender->task_pool); /* Depending on various things we might or might not use global scheduler. */ @@ -886,14 +902,15 @@ static void write_result_func(TaskPool * __restrict pool, */ ReportList reports; BKE_reports_init(&reports, oglrender->reports->flag & ~RPT_PRINT); - /* Do actual save logic here, depending on the file format. */ + /* Do actual save logic here, depending on the file format. + * + * NOTE: We have to construct temporary scene with proper scene->r.cfra. + * This is because underlying calls do not use r.cfra but use scene + * for that. + */ + Scene tmp_scene = *scene; + tmp_scene.r.cfra = cfra; if (is_movie) { - /* We have to construct temporary scene with proper scene->r.cfra. - * This is because underlying calls do not use r.cfra but use scene - * for that. 
- */ - Scene tmp_scene = *scene; - tmp_scene.r.cfra = cfra; ok = RE_WriteRenderViewsMovie(&reports, rr, &tmp_scene, @@ -917,8 +934,8 @@ static void write_result_func(TaskPool * __restrict pool, true, NULL); - BKE_render_result_stamp_info(scene, scene->camera, rr, false); - ok = RE_WriteRenderViewsImage(NULL, rr, scene, true, name); + BKE_render_result_stamp_info(&tmp_scene, tmp_scene.camera, rr, false); + ok = RE_WriteRenderViewsImage(NULL, rr, &tmp_scene, true, name); if (!ok) { BKE_reportf(&reports, RPT_ERROR, diff --git a/source/blender/editors/sculpt_paint/paint_image_proj.c b/source/blender/editors/sculpt_paint/paint_image_proj.c index f5d115442c6..d0f1cc99b8d 100644 --- a/source/blender/editors/sculpt_paint/paint_image_proj.c +++ b/source/blender/editors/sculpt_paint/paint_image_proj.c @@ -5711,21 +5711,16 @@ static bool proj_paint_add_slot(bContext *C, wmOperator *op) /* successful creation of mtex layer, now create set */ if (mtex) { int type = MAP_COL; - int type_id = 0; + char imagename_buff[MAX_ID_NAME - 2]; + const char *imagename = DATA_("Diffuse Color"); if (op) { - int i; type = RNA_enum_get(op->ptr, "type"); - - for (i = 0; i < ARRAY_SIZE(layer_type_items); i++) { - if (layer_type_items[i].value == type) { - type_id = i; - break; - } - } + RNA_string_get(op->ptr, "name", imagename_buff); + imagename = imagename_buff; } - mtex->tex = BKE_texture_add(bmain, DATA_(layer_type_items[type_id].name)); + mtex->tex = BKE_texture_add(bmain, imagename); mtex->mapto = type; if (mtex->tex) { diff --git a/source/blender/editors/sculpt_paint/sculpt.c b/source/blender/editors/sculpt_paint/sculpt.c index 84e98181dfb..44cc2720a32 100644 --- a/source/blender/editors/sculpt_paint/sculpt.c +++ b/source/blender/editors/sculpt_paint/sculpt.c @@ -5361,8 +5361,12 @@ static int sculpt_mode_toggle_exec(bContext *C, wmOperator *op) if (mmd) multires_force_update(ob); - if (flush_recalc || (ob->sculpt && ob->sculpt->bm)) + /* Always for now, so leaving sculpt mode always 
ensures scene is in + * a consistent state. + */ + if (true || flush_recalc || (ob->sculpt && ob->sculpt->bm)) { DAG_id_tag_update(&ob->id, OB_RECALC_DATA); + } if (me->flag & ME_SCULPT_DYNAMIC_TOPOLOGY) { /* Dynamic topology must be disabled before exiting sculpt diff --git a/source/blender/editors/space_clip/tracking_ops.c b/source/blender/editors/space_clip/tracking_ops.c index dc3aa3a5f48..8b9f515995a 100644 --- a/source/blender/editors/space_clip/tracking_ops.c +++ b/source/blender/editors/space_clip/tracking_ops.c @@ -1537,7 +1537,8 @@ static int join_tracks_exec(bContext *C, wmOperator *op) update_stabilization = true; if ((act_track->flag & TRACK_USE_2D_STAB) == 0) { act_track->flag |= TRACK_USE_2D_STAB; - } else { + } + else { stab->tot_track--; } BLI_assert(0 <= stab->tot_track); @@ -1546,7 +1547,8 @@ static int join_tracks_exec(bContext *C, wmOperator *op) update_stabilization = true; if ((act_track->flag & TRACK_USE_2D_STAB_ROT) == 0) { act_track->flag |= TRACK_USE_2D_STAB_ROT; - } else { + } + else { stab->tot_rot_track--; } BLI_assert(0 <= stab->tot_rot_track); diff --git a/source/blender/editors/space_nla/nla_draw.c b/source/blender/editors/space_nla/nla_draw.c index 5b3c062e16d..93dcdbb5c02 100644 --- a/source/blender/editors/space_nla/nla_draw.c +++ b/source/blender/editors/space_nla/nla_draw.c @@ -290,7 +290,8 @@ static void nla_draw_strip_curves(NlaStrip *strip, float yminc, float ymaxc) * - min y-val is yminc, max is y-maxc, so clamp in those regions */ for (cfra = strip->start; cfra <= strip->end; cfra += 1.0f) { - float y = evaluate_fcurve(fcu, cfra); // assume this to be in 0-1 range + float y = evaluate_fcurve(fcu, cfra); + CLAMP(y, 0.0f, 1.0f); glVertex2f(cfra, ((y * yheight) + yminc)); } glEnd(); // GL_LINE_STRIP diff --git a/source/blender/editors/space_node/node_edit.c b/source/blender/editors/space_node/node_edit.c index ffe510016ff..fdfe316f5ed 100644 --- a/source/blender/editors/space_node/node_edit.c +++ 
b/source/blender/editors/space_node/node_edit.c @@ -582,7 +582,7 @@ void snode_set_context(const bContext *C) } } - if (snode->nodetree != ntree || snode->id != id || snode->from != from) { + if (snode->nodetree != ntree || snode->id != id || snode->from != from || snode->treepath.last == NULL) { ED_node_tree_start(snode, ntree, id, from); } @@ -1069,12 +1069,9 @@ int node_find_indicated_socket(SpaceNode *snode, bNode **nodep, bNodeSocket **so /* check if we click in a socket */ for (node = snode->edittree->nodes.first; node; node = node->next) { - - rect.xmin = cursor[0] - (NODE_SOCKSIZE + 4); - rect.ymin = cursor[1] - (NODE_SOCKSIZE + 4); - rect.xmax = cursor[0] + (NODE_SOCKSIZE + 4); - rect.ymax = cursor[1] + (NODE_SOCKSIZE + 4); - + + BLI_rctf_init_pt_radius(&rect, cursor, NODE_SOCKSIZE + 4); + if (!(node->flag & NODE_HIDDEN)) { /* extra padding inside and out - allow dragging on the text areas too */ if (in_out == SOCK_IN) { diff --git a/source/blender/editors/space_outliner/outliner_draw.c b/source/blender/editors/space_outliner/outliner_draw.c index 99242fd12f9..684a1f9fd67 100644 --- a/source/blender/editors/space_outliner/outliner_draw.c +++ b/source/blender/editors/space_outliner/outliner_draw.c @@ -1126,6 +1126,7 @@ static void tselem_draw_icon(uiBlock *block, int xmax, float x, float y, TreeSto case eModifierType_Cast: UI_icon_draw(x, y, ICON_MOD_CAST); break; case eModifierType_MeshDeform: + case eModifierType_SurfaceDeform: UI_icon_draw(x, y, ICON_MOD_MESHDEFORM); break; case eModifierType_Bevel: UI_icon_draw(x, y, ICON_MOD_BEVEL); break; diff --git a/source/blender/editors/space_sequencer/sequencer_draw.c b/source/blender/editors/space_sequencer/sequencer_draw.c index e1768e4aedc..70a6e6d83cb 100644 --- a/source/blender/editors/space_sequencer/sequencer_draw.c +++ b/source/blender/editors/space_sequencer/sequencer_draw.c @@ -545,7 +545,8 @@ static void draw_seq_text(View2D *v2d, SpaceSeq *sseq, Sequence *seq, float x1, if ((sseq->flag & 
SEQ_ALL_WAVEFORMS) || (seq->flag & SEQ_AUDIO_DRAW_WAVEFORM)) { str[0] = 0; str_len = 0; - } else if (seq->sound) { + } + else if (seq->sound) { str_len = BLI_snprintf(str, sizeof(str), "%s: %s | %d", name, seq->sound->name, seq->len); } diff --git a/source/blender/editors/space_view3d/drawvolume.c b/source/blender/editors/space_view3d/drawvolume.c index 27ecbf83db5..182dc214f8e 100644 --- a/source/blender/editors/space_view3d/drawvolume.c +++ b/source/blender/editors/space_view3d/drawvolume.c @@ -774,8 +774,8 @@ void draw_smoke_velocity(SmokeDomainSettings *domain, float viewnormal[3]) float min[3] = { domain->p0[0] - domain->cell_size[0] * domain->adapt_res, - domain->p0[1] - domain->cell_size[1] * domain->adapt_res, - domain->p0[2] - domain->cell_size[2] * domain->adapt_res, + domain->p0[1] - domain->cell_size[1] * domain->adapt_res, + domain->p0[2] - domain->cell_size[2] * domain->adapt_res, }; int num_points_v[3] = { diff --git a/source/blender/editors/space_view3d/space_view3d.c b/source/blender/editors/space_view3d/space_view3d.c index 964f4bcdd9c..b8228c63209 100644 --- a/source/blender/editors/space_view3d/space_view3d.c +++ b/source/blender/editors/space_view3d/space_view3d.c @@ -180,8 +180,8 @@ bool ED_view3d_context_user_region(bContext *C, View3D **r_v3d, ARegion **r_ar) View3D *v3d = (View3D *)sa->spacedata.first; if (ar) { - RegionView3D *rv3d = ar->regiondata; - if (rv3d && (rv3d->viewlock & RV3D_LOCKED) == 0) { + RegionView3D *rv3d; + if ((ar->regiontype == RGN_TYPE_WINDOW) && (rv3d = ar->regiondata) && (rv3d->viewlock & RV3D_LOCKED) == 0) { *r_v3d = v3d; *r_ar = ar; return true; @@ -869,6 +869,7 @@ static void view3d_main_region_listener(bScreen *sc, ScrArea *sa, ARegion *ar, w case ND_CONSTRAINT: case ND_KEYS: case ND_PARTICLE: + case ND_POINTCACHE: case ND_LOD: ED_region_tag_redraw(ar); break; diff --git a/source/blender/editors/space_view3d/view3d_draw.c b/source/blender/editors/space_view3d/view3d_draw.c index f23e587e55d..0c5cf1bd936 100644 
--- a/source/blender/editors/space_view3d/view3d_draw.c +++ b/source/blender/editors/space_view3d/view3d_draw.c @@ -2955,7 +2955,7 @@ struct RV3DMatrixStore { float pixsize; }; -void *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d) +struct RV3DMatrixStore *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d) { struct RV3DMatrixStore *rv3dmat = MEM_mallocN(sizeof(*rv3dmat), __func__); copy_m4_m4(rv3dmat->winmat, rv3d->winmat); @@ -2968,9 +2968,8 @@ void *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d) return (void *)rv3dmat; } -void ED_view3d_mats_rv3d_restore(struct RegionView3D *rv3d, void *rv3dmat_pt) +void ED_view3d_mats_rv3d_restore(struct RegionView3D *rv3d, struct RV3DMatrixStore *rv3dmat) { - struct RV3DMatrixStore *rv3dmat = rv3dmat_pt; copy_m4_m4(rv3d->winmat, rv3dmat->winmat); copy_m4_m4(rv3d->viewmat, rv3dmat->viewmat); copy_m4_m4(rv3d->persmat, rv3dmat->persmat); diff --git a/source/blender/editors/space_view3d/view3d_edit.c b/source/blender/editors/space_view3d/view3d_edit.c index 2b53eb71d99..f07727f8118 100644 --- a/source/blender/editors/space_view3d/view3d_edit.c +++ b/source/blender/editors/space_view3d/view3d_edit.c @@ -90,19 +90,6 @@ bool ED_view3d_offset_lock_check(const View3D *v3d, const RegionView3D *rv3d) return (rv3d->persp != RV3D_CAMOB) && (v3d->ob_centre_cursor || v3d->ob_centre); } -static bool view3d_operator_offset_lock_check(bContext *C, wmOperator *op) -{ - View3D *v3d = CTX_wm_view3d(C); - RegionView3D *rv3d = CTX_wm_region_view3d(C); - if (ED_view3d_offset_lock_check(v3d, rv3d)) { - BKE_report(op->reports, RPT_WARNING, "View offset is locked"); - return true; - } - else { - return false; - } -} - /* ********************** view3d_edit: view manipulations ********************* */ /** @@ -2596,6 +2583,19 @@ void VIEW3D_OT_zoom(wmOperatorType *ot) /* ************************ viewdolly ******************************** */ +static bool viewdolly_offset_lock_check(bContext *C, wmOperator *op) +{ + View3D *v3d = 
CTX_wm_view3d(C); + RegionView3D *rv3d = CTX_wm_region_view3d(C); + if (ED_view3d_offset_lock_check(v3d, rv3d)) { + BKE_report(op->reports, RPT_WARNING, "Cannot dolly when the view offset is locked"); + return true; + } + else { + return false; + } +} + static void view_dolly_mouseloc(ARegion *ar, float orig_ofs[3], float dvec[3], float dfac) { RegionView3D *rv3d = ar->regiondata; @@ -2746,7 +2746,7 @@ static int viewdolly_invoke(bContext *C, wmOperator *op, const wmEvent *event) { ViewOpsData *vod; - if (view3d_operator_offset_lock_check(C, op)) + if (viewdolly_offset_lock_check(C, op)) return OPERATOR_CANCELLED; /* makes op->customdata */ @@ -4364,41 +4364,24 @@ static EnumPropertyItem prop_view_pan_items[] = { {0, NULL, 0, NULL, NULL} }; -static int viewpan_exec(bContext *C, wmOperator *op) +static int viewpan_invoke(bContext *C, wmOperator *op, const wmEvent *event) { - ScrArea *sa = CTX_wm_area(C); - ARegion *ar = CTX_wm_region(C); - View3D *v3d = CTX_wm_view3d(C); - RegionView3D *rv3d = CTX_wm_region_view3d(C); - float vec[3]; - const float co_zero[3] = {0.0f}; - float mval_f[2] = {0.0f, 0.0f}; - float zfac; - int pandir; + int x = 0, y = 0; + int pandir = RNA_enum_get(op->ptr, "type"); - if (view3d_operator_offset_lock_check(C, op)) - return OPERATOR_CANCELLED; + if (pandir == V3D_VIEW_PANRIGHT) { x = -32; } + else if (pandir == V3D_VIEW_PANLEFT) { x = 32; } + else if (pandir == V3D_VIEW_PANUP) { y = -25; } + else if (pandir == V3D_VIEW_PANDOWN) { y = 25; } - pandir = RNA_enum_get(op->ptr, "type"); - - ED_view3d_camera_lock_init(v3d, rv3d); - - zfac = ED_view3d_calc_zfac(rv3d, co_zero, NULL); - if (pandir == V3D_VIEW_PANRIGHT) { mval_f[0] = -32.0f; } - else if (pandir == V3D_VIEW_PANLEFT) { mval_f[0] = 32.0f; } - else if (pandir == V3D_VIEW_PANUP) { mval_f[1] = -25.0f; } - else if (pandir == V3D_VIEW_PANDOWN) { mval_f[1] = 25.0f; } - ED_view3d_win_to_delta(ar, mval_f, vec, zfac); - add_v3_v3(rv3d->ofs, vec); - - if (rv3d->viewlock & RV3D_BOXVIEW) - 
view3d_boxview_sync(sa, ar); - - ED_view3d_depth_tag_update(rv3d); + viewops_data_alloc(C, op); + viewops_data_create(C, op, event); + ViewOpsData *vod = op->customdata; - ED_view3d_camera_lock_sync(v3d, rv3d); + viewmove_apply(vod, vod->oldx + x, vod->oldy + y); - ED_region_tag_redraw(ar); + ED_view3d_depth_tag_update(vod->rv3d); + viewops_data_free(C, op); return OPERATOR_FINISHED; } @@ -4411,7 +4394,7 @@ void VIEW3D_OT_view_pan(wmOperatorType *ot) ot->idname = "VIEW3D_OT_view_pan"; /* api callbacks */ - ot->exec = viewpan_exec; + ot->invoke = viewpan_invoke; ot->poll = ED_operator_region_view3d_active; /* flags */ @@ -4798,6 +4781,7 @@ static int manipulator_invoke(bContext *C, wmOperator *op, const wmEvent *event) void VIEW3D_OT_manipulator(wmOperatorType *ot) { + PropertyRNA *prop; /* identifiers */ ot->name = "3D Manipulator"; @@ -4812,8 +4796,9 @@ void VIEW3D_OT_manipulator(wmOperatorType *ot) /* properties to pass to transform */ Transform_Properties(ot, P_CONSTRAINT); - RNA_def_boolean(ot->srna, "use_planar_constraint", false, "Planar Constraint", "Limit the transformation to the " - "two axes that have not been clicked (translate/scale only)"); + prop = RNA_def_boolean(ot->srna, "use_planar_constraint", false, "Planar Constraint", "Limit the transformation to the " + "two axes that have not been clicked (translate/scale only)"); + RNA_def_property_flag(prop, PROP_SKIP_SAVE | PROP_HIDDEN); } static int enable_manipulator_invoke(bContext *C, wmOperator *op, const wmEvent *UNUSED(event)) @@ -4902,11 +4887,7 @@ static float view_autodist_depth_margin(ARegion *ar, const int mval[2], int marg rect.ymax = mval[1] + 1; } else { - rect.xmax = mval[0] + margin; - rect.ymax = mval[1] + margin; - - rect.xmin = mval[0] - margin; - rect.ymin = mval[1] - margin; + BLI_rcti_init_pt_radius(&rect, mval, margin); } view3d_update_depths_rect(ar, &depth_temp, &rect); diff --git a/source/blender/editors/space_view3d/view3d_intern.h 
b/source/blender/editors/space_view3d/view3d_intern.h index b11f42bcfef..87b3d95cd4e 100644 --- a/source/blender/editors/space_view3d/view3d_intern.h +++ b/source/blender/editors/space_view3d/view3d_intern.h @@ -241,7 +241,7 @@ void ED_view3d_smooth_view_force_finish( struct bContext *C, struct View3D *v3d, struct ARegion *ar); -void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rctf *rect); +void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rcti *rect); void view3d_viewmatrix_set(Scene *scene, const View3D *v3d, RegionView3D *rv3d); void fly_modal_keymap(struct wmKeyConfig *keyconf); diff --git a/source/blender/editors/space_view3d/view3d_ops.c b/source/blender/editors/space_view3d/view3d_ops.c index 0fa6841fe27..d71639c35d2 100644 --- a/source/blender/editors/space_view3d/view3d_ops.c +++ b/source/blender/editors/space_view3d/view3d_ops.c @@ -241,13 +241,21 @@ void view3d_keymap(wmKeyConfig *keyconf) keymap = WM_keymap_find(keyconf, "3D View", SPACE_VIEW3D, 0); /* Shift+LMB behavior first, so it has priority over KM_ANY item below. */ - kmi = WM_keymap_verify_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0); + kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0); RNA_boolean_set(kmi->ptr, "release_confirm", true); RNA_boolean_set(kmi->ptr, "use_planar_constraint", true); + RNA_boolean_set(kmi->ptr, "use_accurate", false); + + kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0); + RNA_boolean_set(kmi->ptr, "release_confirm", true); + RNA_boolean_set(kmi->ptr, "use_planar_constraint", false); + RNA_boolean_set(kmi->ptr, "use_accurate", true); + /* Using KM_ANY here to allow holding modifiers before starting to transform. 
*/ kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_ANY, 0); RNA_boolean_set(kmi->ptr, "release_confirm", true); RNA_boolean_set(kmi->ptr, "use_planar_constraint", false); + RNA_boolean_set(kmi->ptr, "use_accurate", false); WM_keymap_verify_item(keymap, "VIEW3D_OT_cursor3d", ACTIONMOUSE, KM_PRESS, 0, 0); diff --git a/source/blender/editors/space_view3d/view3d_select.c b/source/blender/editors/space_view3d/view3d_select.c index 3239d07553f..0c0a7df8f84 100644 --- a/source/blender/editors/space_view3d/view3d_select.c +++ b/source/blender/editors/space_view3d/view3d_select.c @@ -96,8 +96,12 @@ #include "GPU_draw.h" +#include "GPU_select.h" + #include "view3d_intern.h" /* own include */ +// #include "PIL_time_utildefines.h" + float ED_view3d_select_dist_px(void) { return 75.0f * U.pixelsize; @@ -1087,7 +1091,9 @@ static void deselectall_except(Scene *scene, Base *b) /* deselect all except b } } -static Base *object_mouse_select_menu(bContext *C, ViewContext *vc, unsigned int *buffer, int hits, const int mval[2], short toggle) +static Base *object_mouse_select_menu( + bContext *C, ViewContext *vc, unsigned int *buffer, int hits, + const int mval[2], bool toggle) { short baseCount = 0; bool ok; @@ -1178,19 +1184,19 @@ static bool selectbuffer_has_bones(const unsigned int *buffer, const unsigned in } /* utility function for mixed_bones_object_selectbuffer */ -static short selectbuffer_ret_hits_15(unsigned int *UNUSED(buffer), const short hits15) +static int selectbuffer_ret_hits_15(unsigned int *UNUSED(buffer), const int hits15) { return hits15; } -static short selectbuffer_ret_hits_9(unsigned int *buffer, const short hits15, const short hits9) +static int selectbuffer_ret_hits_9(unsigned int *buffer, const int hits15, const int hits9) { const int offs = 4 * hits15; memcpy(buffer, buffer + offs, 4 * hits9 * sizeof(unsigned int)); return hits9; } -static short selectbuffer_ret_hits_5(unsigned int *buffer, const short hits15, const short 
hits9, const short hits5) +static int selectbuffer_ret_hits_5(unsigned int *buffer, const int hits15, const int hits9, const int hits5) { const int offs = 4 * hits15 + 4 * hits9; memcpy(buffer, buffer + offs, 4 * hits5 * sizeof(unsigned int)); @@ -1199,14 +1205,13 @@ static short selectbuffer_ret_hits_5(unsigned int *buffer, const short hits15, c /* we want a select buffer with bones, if there are... */ /* so check three selection levels and compare */ -static short mixed_bones_object_selectbuffer( +static int mixed_bones_object_selectbuffer( ViewContext *vc, unsigned int *buffer, const int mval[2], bool use_cycle, bool enumerate, bool *r_do_nearest) { rcti rect; - int offs; - short hits15, hits9 = 0, hits5 = 0; + int hits15, hits9 = 0, hits5 = 0; bool has_bones15 = false, has_bones9 = false, has_bones5 = false; static int last_mval[2] = {-100, -100}; bool do_nearest = false; @@ -1234,44 +1239,57 @@ static short mixed_bones_object_selectbuffer( do_nearest = do_nearest && !enumerate; - BLI_rcti_init(&rect, mval[0] - 14, mval[0] + 14, mval[1] - 14, mval[1] + 14); - hits15 = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, do_nearest); + const int select_mode = (do_nearest ? 
VIEW3D_SELECT_PICK_NEAREST : VIEW3D_SELECT_PICK_ALL); + int hits = 0; + + /* we _must_ end cache before return, use 'goto finally' */ + GPU_select_cache_begin(); + + BLI_rcti_init_pt_radius(&rect, mval, 14); + hits15 = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, select_mode); if (hits15 == 1) { - return selectbuffer_ret_hits_15(buffer, hits15); + hits = selectbuffer_ret_hits_15(buffer, hits15); + goto finally; } else if (hits15 > 0) { + int offs; has_bones15 = selectbuffer_has_bones(buffer, hits15); offs = 4 * hits15; - BLI_rcti_init(&rect, mval[0] - 9, mval[0] + 9, mval[1] - 9, mval[1] + 9); - hits9 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, do_nearest); + BLI_rcti_init_pt_radius(&rect, mval, 9); + hits9 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, select_mode); if (hits9 == 1) { - return selectbuffer_ret_hits_9(buffer, hits15, hits9); + hits = selectbuffer_ret_hits_9(buffer, hits15, hits9); + goto finally; } else if (hits9 > 0) { has_bones9 = selectbuffer_has_bones(buffer + offs, hits9); offs += 4 * hits9; - BLI_rcti_init(&rect, mval[0] - 5, mval[0] + 5, mval[1] - 5, mval[1] + 5); - hits5 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, do_nearest); + BLI_rcti_init_pt_radius(&rect, mval, 5); + hits5 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, select_mode); if (hits5 == 1) { - return selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5); + hits = selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5); + goto finally; } else if (hits5 > 0) { has_bones5 = selectbuffer_has_bones(buffer + offs, hits5); } } - if (has_bones5) return selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5); - else if (has_bones9) return selectbuffer_ret_hits_9(buffer, hits15, hits9); - else if (has_bones15) return selectbuffer_ret_hits_15(buffer, hits15); - - if (hits5 > 0) return selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5); - else if (hits9 > 0) return selectbuffer_ret_hits_9(buffer, 
hits15, hits9); - else return selectbuffer_ret_hits_15(buffer, hits15); + if (has_bones5) { hits = selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5); goto finally; } + else if (has_bones9) { hits = selectbuffer_ret_hits_9(buffer, hits15, hits9); goto finally; } + else if (has_bones15) { hits = selectbuffer_ret_hits_15(buffer, hits15); goto finally; } + + if (hits5 > 0) { hits = selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5); goto finally; } + else if (hits9 > 0) { hits = selectbuffer_ret_hits_9(buffer, hits15, hits9); goto finally; } + else { hits = selectbuffer_ret_hits_15(buffer, hits15); goto finally; } } - - return 0; + +finally: + GPU_select_cache_end(); + + return hits; } /* returns basact */ @@ -1412,7 +1430,7 @@ static bool ed_object_select_pick( bool is_obedit; float dist = ED_view3d_select_dist_px() * 1.3333f; bool retval = false; - short hits; + int hits; const float mval_fl[2] = {(float)mval[0], (float)mval[1]}; @@ -1464,10 +1482,13 @@ static bool ed_object_select_pick( unsigned int buffer[MAXPICKBUF]; bool do_nearest; + // TIMEIT_START(select_time); + /* if objects have posemode set, the bones are in the same selection buffer */ - hits = mixed_bones_object_selectbuffer(&vc, buffer, mval, true, enumerate, &do_nearest); - + + // TIMEIT_END(select_time); + if (hits > 0) { /* note: bundles are handling in the same way as bones */ const bool has_bones = selectbuffer_has_bones(buffer, hits); @@ -1904,9 +1925,9 @@ static int do_meta_box_select(ViewContext *vc, rcti *rect, bool select, bool ext int a; unsigned int buffer[MAXPICKBUF]; - short hits; + int hits; - hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, false); + hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, VIEW3D_SELECT_ALL); if (extend == false && select) BKE_mball_deselect_all(mb); @@ -1938,9 +1959,9 @@ static int do_armature_box_select(ViewContext *vc, rcti *rect, bool select, bool int a; unsigned int buffer[MAXPICKBUF]; - short hits; + int hits; - hits = 
view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, false); + hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, VIEW3D_SELECT_ALL); /* clear flag we use to detect point was affected */ for (ebone = arm->edbo->first; ebone; ebone = ebone->next) @@ -2013,7 +2034,7 @@ static int do_object_pose_box_select(bContext *C, ViewContext *vc, rcti *rect, b int bone_only; int bone_selected = 0; int totobj = MAXPICKBUF; /* XXX solve later */ - short hits; + int hits; if ((ob) && (ob->mode & OB_MODE_POSE)) bone_only = 1; @@ -2037,7 +2058,7 @@ static int do_object_pose_box_select(bContext *C, ViewContext *vc, rcti *rect, b /* selection buffer now has bones potentially too, so we add MAXPICKBUF */ vbuffer = MEM_mallocN(4 * (totobj + MAXPICKELEMS) * sizeof(unsigned int), "selection buffer"); - hits = view3d_opengl_select(vc, vbuffer, 4 * (totobj + MAXPICKELEMS), rect, false); + hits = view3d_opengl_select(vc, vbuffer, 4 * (totobj + MAXPICKELEMS), rect, VIEW3D_SELECT_ALL); /* * LOGIC NOTES (theeth): * The buffer and ListBase have the same relative order, which makes the selection @@ -2577,7 +2598,7 @@ static void lattice_circle_select(ViewContext *vc, const bool select, const int /* NOTE: pose-bone case is copied from editbone case... 
*/ -static short pchan_circle_doSelectJoint(void *userData, bPoseChannel *pchan, const float screen_co[2]) +static bool pchan_circle_doSelectJoint(void *userData, bPoseChannel *pchan, const float screen_co[2]) { CircleSelectUserData *data = userData; @@ -2655,7 +2676,7 @@ static void pose_circle_select(ViewContext *vc, const bool select, const int mva } } -static short armature_circle_doSelectJoint(void *userData, EditBone *ebone, const float screen_co[2], short head) +static bool armature_circle_doSelectJoint(void *userData, EditBone *ebone, const float screen_co[2], bool head) { CircleSelectUserData *data = userData; diff --git a/source/blender/editors/space_view3d/view3d_view.c b/source/blender/editors/space_view3d/view3d_view.c index 8582952d1a0..9d1a3633786 100644 --- a/source/blender/editors/space_view3d/view3d_view.c +++ b/source/blender/editors/space_view3d/view3d_view.c @@ -908,7 +908,7 @@ void ED_view3d_polygon_offset(const RegionView3D *rv3d, const float dist) /** * \param rect optional for picking (can be NULL). */ -void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rctf *rect) +void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rcti *rect) { RegionView3D *rv3d = ar->regiondata; rctf viewplane; @@ -1170,29 +1170,64 @@ static void view3d_select_loop(ViewContext *vc, Scene *scene, View3D *v3d, ARegi * * \note (vc->obedit == NULL) can be set to explicitly skip edit-object selection. 
*/ -short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int bufsize, const rcti *input, bool do_nearest) +int view3d_opengl_select( + ViewContext *vc, unsigned int *buffer, unsigned int bufsize, const rcti *input, + int select_mode) { Scene *scene = vc->scene; View3D *v3d = vc->v3d; ARegion *ar = vc->ar; - rctf rect; - short hits; + rcti rect; + int hits; const bool use_obedit_skip = (scene->obedit != NULL) && (vc->obedit == NULL); - const bool do_passes = do_nearest && GPU_select_query_check_active(); + const bool is_pick_select = (U.gpu_select_pick_deph != 0); + const bool do_passes = ( + (is_pick_select == false) && + (select_mode == VIEW3D_SELECT_PICK_NEAREST) && + GPU_select_query_check_active()); + + char gpu_select_mode; - G.f |= G_PICKSEL; - /* case not a border select */ if (input->xmin == input->xmax) { - rect.xmin = input->xmin - 12; /* seems to be default value for bones only now */ - rect.xmax = input->xmin + 12; - rect.ymin = input->ymin - 12; - rect.ymax = input->ymin + 12; + /* seems to be default value for bones only now */ + BLI_rcti_init_pt_radius(&rect, (const int[2]){input->xmin, input->ymin}, 12); } else { - BLI_rctf_rcti_copy(&rect, input); + rect = *input; } - + + if (is_pick_select) { + if (is_pick_select && select_mode == VIEW3D_SELECT_PICK_NEAREST) { + gpu_select_mode = GPU_SELECT_PICK_NEAREST; + } + else if (is_pick_select && select_mode == VIEW3D_SELECT_PICK_ALL) { + gpu_select_mode = GPU_SELECT_PICK_ALL; + } + else { + gpu_select_mode = GPU_SELECT_ALL; + } + } + else { + if (do_passes) { + gpu_select_mode = GPU_SELECT_NEAREST_FIRST_PASS; + } + else { + gpu_select_mode = GPU_SELECT_ALL; + } + } + + /* Re-use cache (rect must be smaller then the cached) + * other context is assumed to be unchanged */ + if (GPU_select_is_cached()) { + GPU_select_begin(buffer, bufsize, &rect, gpu_select_mode, 0); + GPU_select_cache_load_id(); + hits = GPU_select_end(); + goto finally; + } + + G.f |= G_PICKSEL; + 
view3d_winmatrix_set(ar, v3d, &rect); mul_m4_m4m4(vc->rv3d->persmat, vc->rv3d->winmat, vc->rv3d->viewmat); @@ -1204,10 +1239,7 @@ short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int b if (vc->rv3d->rflag & RV3D_CLIPPING) ED_view3d_clipping_set(vc->rv3d); - if (do_passes) - GPU_select_begin(buffer, bufsize, &rect, GPU_SELECT_NEAREST_FIRST_PASS, 0); - else - GPU_select_begin(buffer, bufsize, &rect, GPU_SELECT_ALL, 0); + GPU_select_begin(buffer, bufsize, &rect, gpu_select_mode, 0); view3d_select_loop(vc, scene, v3d, ar, use_obedit_skip); @@ -1233,7 +1265,8 @@ short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int b if (vc->rv3d->rflag & RV3D_CLIPPING) ED_view3d_clipping_disable(); - + +finally: if (hits < 0) printf("Too many objects in select buffer\n"); /* XXX make error message */ return hits; diff --git a/source/blender/editors/transform/transform.c b/source/blender/editors/transform/transform.c index 1916f9b4dab..7d9063c3285 100644 --- a/source/blender/editors/transform/transform.c +++ b/source/blender/editors/transform/transform.c @@ -2176,7 +2176,14 @@ bool initTransform(bContext *C, TransInfo *t, wmOperator *op, const wmEvent *eve calculateCenter(t); if (event) { - initMouseInput(t, &t->mouse, t->center2d, event->mval, event->shift); + /* Initialize accurate transform to settings requested by keymap. 
*/ + bool use_accurate = false; + if ((prop = RNA_struct_find_property(op->ptr, "use_accurate")) && RNA_property_is_set(op->ptr, prop)) { + if (RNA_property_boolean_get(op->ptr, prop)) { + use_accurate = true; + } + } + initMouseInput(t, &t->mouse, t->center2d, event->mval, use_accurate); } switch (mode) { diff --git a/source/blender/editors/transform/transform_manipulator.c b/source/blender/editors/transform/transform_manipulator.c index e141724f2df..0a984d90ae3 100644 --- a/source/blender/editors/transform/transform_manipulator.c +++ b/source/blender/editors/transform/transform_manipulator.c @@ -1724,14 +1724,14 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl { View3D *v3d = sa->spacedata.first; RegionView3D *rv3d = ar->regiondata; - rctf rect, selrect; + rcti rect; GLuint buffer[64]; // max 4 items per select, so large enuf short hits; const bool is_picksel = true; const bool do_passes = GPU_select_query_check_active(); /* XXX check a bit later on this... 
(ton) */ - extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, rctf *rect); + extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, const rcti *rect); /* when looking through a selected camera, the manipulator can be at the * exact same position as the view, skip so we don't break selection */ @@ -1743,15 +1743,13 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl rect.ymin = mval[1] - hotspot; rect.ymax = mval[1] + hotspot; - selrect = rect; - view3d_winmatrix_set(ar, v3d, &rect); mul_m4_m4m4(rv3d->persmat, rv3d->winmat, rv3d->viewmat); if (do_passes) - GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_NEAREST_FIRST_PASS, 0); + GPU_select_begin(buffer, 64, &rect, GPU_SELECT_NEAREST_FIRST_PASS, 0); else - GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_ALL, 0); + GPU_select_begin(buffer, 64, &rect, GPU_SELECT_ALL, 0); /* do the drawing */ if (v3d->twtype & V3D_MANIP_ROTATE) { @@ -1766,7 +1764,7 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl hits = GPU_select_end(); if (do_passes) { - GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_NEAREST_SECOND_PASS, hits); + GPU_select_begin(buffer, 64, &rect, GPU_SELECT_NEAREST_SECOND_PASS, hits); /* do the drawing */ if (v3d->twtype & V3D_MANIP_ROTATE) { @@ -1826,6 +1824,23 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl return 0; } +static const char *manipulator_get_operator_name(int man_val) +{ + if (man_val & MAN_TRANS_C) { + return "TRANSFORM_OT_translate"; + } + else if (man_val == MAN_ROT_T) { + return "TRANSFORM_OT_trackball"; + } + else if (man_val & MAN_ROT_C) { + return "TRANSFORM_OT_rotate"; + } + else if (man_val & MAN_SCALE_C) { + return "TRANSFORM_OT_resize"; + } + + return NULL; +} /* return 0; nothing happened */ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op) @@ -1846,11 +1861,24 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator 
*op) // find the hotspots first test narrow hotspot val = manipulator_selectbuf(sa, ar, event->mval, 0.5f * (float)U.tw_hotspot); if (val) { + wmOperatorType *ot; + PointerRNA props_ptr; + PropertyRNA *prop; + const char *opname; // drawflags still global, for drawing call above drawflags = manipulator_selectbuf(sa, ar, event->mval, 0.2f * (float)U.tw_hotspot); if (drawflags == 0) drawflags = val; + /* Planar constraint doesn't make sense for rotation, give other keymaps a chance */ + if ((drawflags & MAN_ROT_C) && use_planar) { + return 0; + } + + opname = manipulator_get_operator_name(drawflags); + ot = WM_operatortype_find(opname, true); + WM_operator_properties_create_ptr(&props_ptr, ot); + if (drawflags & MAN_TRANS_C) { switch (drawflags) { case MAN_TRANS_C: @@ -1880,8 +1908,7 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op) constraint_axis[2] = 1; break; } - RNA_boolean_set_array(op->ptr, "constraint_axis", constraint_axis); - WM_operator_name_call(C, "TRANSFORM_OT_translate", WM_OP_INVOKE_DEFAULT, op->ptr); + RNA_boolean_set_array(&props_ptr, "constraint_axis", constraint_axis); } else if (drawflags & MAN_SCALE_C) { switch (drawflags) { @@ -1910,22 +1937,10 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op) constraint_axis[2] = 1; break; } - RNA_boolean_set_array(op->ptr, "constraint_axis", constraint_axis); - WM_operator_name_call(C, "TRANSFORM_OT_resize", WM_OP_INVOKE_DEFAULT, op->ptr); + RNA_boolean_set_array(&props_ptr, "constraint_axis", constraint_axis); } - else if (drawflags == MAN_ROT_T) { /* trackball need special case, init is different */ - /* Do not pass op->ptr!!! trackball has no "constraint" properties! - * See [#34621], it's a miracle it did not cause more problems!!! */ - /* However, we need to copy the "release_confirm" property, but only if defined, see T41112. 
*/ - PointerRNA props_ptr; - PropertyRNA *prop; - wmOperatorType *ot = WM_operatortype_find("TRANSFORM_OT_trackball", true); - WM_operator_properties_create_ptr(&props_ptr, ot); - if ((prop = RNA_struct_find_property(op->ptr, "release_confirm")) && RNA_property_is_set(op->ptr, prop)) { - RNA_property_boolean_set(&props_ptr, prop, RNA_property_boolean_get(op->ptr, prop)); - } - WM_operator_name_call_ptr(C, ot, WM_OP_INVOKE_DEFAULT, &props_ptr); - WM_operator_properties_free(&props_ptr); + else if (drawflags == MAN_ROT_T) { + /* pass */ } else if (drawflags & MAN_ROT_C) { switch (drawflags) { @@ -1939,9 +1954,25 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op) constraint_axis[2] = 1; break; } - RNA_boolean_set_array(op->ptr, "constraint_axis", constraint_axis); - WM_operator_name_call(C, "TRANSFORM_OT_rotate", WM_OP_INVOKE_DEFAULT, op->ptr); + RNA_boolean_set_array(&props_ptr, "constraint_axis", constraint_axis); + } + + /* pass operator properties on to transform operators */ + prop = RNA_struct_find_property(op->ptr, "use_accurate"); + if (RNA_property_is_set(op->ptr, prop)) { + RNA_property_boolean_set(&props_ptr, prop, RNA_property_boolean_get(op->ptr, prop)); + } + prop = RNA_struct_find_property(op->ptr, "release_confirm"); + if (RNA_property_is_set(op->ptr, prop)) { + RNA_property_boolean_set(&props_ptr, prop, RNA_property_boolean_get(op->ptr, prop)); } + prop = RNA_struct_find_property(op->ptr, "constraint_orientation"); + if (RNA_property_is_set(op->ptr, prop)) { + RNA_property_enum_set(&props_ptr, prop, RNA_property_enum_get(op->ptr, prop)); + } + + WM_operator_name_call_ptr(C, ot, WM_OP_INVOKE_DEFAULT, &props_ptr); + WM_operator_properties_free(&props_ptr); } /* after transform, restore drawflags */ drawflags = 0xFFFF; diff --git a/source/blender/editors/transform/transform_ops.c b/source/blender/editors/transform/transform_ops.c index cbe58ddf586..2a97384cf7d 100644 --- a/source/blender/editors/transform/transform_ops.c 
+++ b/source/blender/editors/transform/transform_ops.c @@ -569,6 +569,9 @@ void Transform_Properties(struct wmOperatorType *ot, int flags) // Add confirm method all the time. At the end because it's not really that important and should be hidden only in log, not in keymap edit /*prop =*/ RNA_def_boolean(ot->srna, "release_confirm", 0, "Confirm on Release", "Always confirm operation when releasing button"); //RNA_def_property_flag(prop, PROP_HIDDEN); + + prop = RNA_def_boolean(ot->srna, "use_accurate", 0, "Accurate", "Use accurate transformation"); + RNA_def_property_flag(prop, PROP_HIDDEN); } } diff --git a/source/blender/editors/transform/transform_snap_object.c b/source/blender/editors/transform/transform_snap_object.c index 7c9dc43dbe4..cf16bb8817d 100644 --- a/source/blender/editors/transform/transform_snap_object.c +++ b/source/blender/editors/transform/transform_snap_object.c @@ -87,6 +87,8 @@ typedef struct SnapObjectData { typedef struct SnapObjectData_Mesh { SnapObjectData sd; BVHTreeFromMesh *bvh_trees[3]; + MPoly *mpoly; + bool poly_allocated; } SnapObjectData_Mesh; @@ -1051,7 +1053,6 @@ static int dm_looptri_to_poly_index(DerivedMesh *dm, const MLoopTri *lt) static bool snapDerivedMesh( SnapObjectContext *sctx, SnapData *snapdata, Object *ob, DerivedMesh *dm, float obmat[4][4], const unsigned int ob_index, - bool do_bb, /* read/write args */ float *ray_depth, float *dist_px, /* return args */ @@ -1112,39 +1113,31 @@ static bool snapDerivedMesh( copy_v3_v3(ray_org_local, snapdata->ray_origin); mul_m4_v3(imat, ray_org_local); - if (do_bb) { - BoundBox *bb = BKE_object_boundbox_get(ob); - - if (bb) { - BoundBox bb_temp; - - /* We cannot afford a bounding box with some null dimension, which may happen in some cases... - * Threshold is rather high, but seems to be needed to get good behavior, see T46099. 
*/ - bb = BKE_boundbox_ensure_minimum_dimensions(bb, &bb_temp, 1e-1f); - - /* In vertex and edges you need to get the pixel distance from ray to BoundBox, see T46816. */ - if (ELEM(snapdata->snap_to, SCE_SNAP_MODE_VERTEX, SCE_SNAP_MODE_EDGE)) { - float dist_px_sq = dist_squared_to_projected_aabb_simple( - lpmat, snapdata->win_half, ray_min_dist, snapdata->mval, - ray_org_local, ray_normal_local, bb->vec[0], bb->vec[6]); - if (dist_px_sq > SQUARE(*dist_px)) - { - return retval; - } + /* Test BoundBox */ + BoundBox *bb = BKE_object_boundbox_get(ob); + if (bb) { + /* In vertex and edges you need to get the pixel distance from ray to BoundBox, see: T46099, T46816 */ + if (ELEM(snapdata->snap_to, SCE_SNAP_MODE_VERTEX, SCE_SNAP_MODE_EDGE)) { + float dist_px_sq = dist_squared_to_projected_aabb_simple( + lpmat, snapdata->win_half, ray_min_dist, snapdata->mval, + ray_org_local, ray_normal_local, bb->vec[0], bb->vec[6]); + if (dist_px_sq > SQUARE(*dist_px)) + { + return retval; } - else { - /* was BKE_boundbox_ray_hit_check, see: cf6ca226fa58 */ - if (!isect_ray_aabb_v3_simple( - ray_start_local, ray_normal_local, bb->vec[0], bb->vec[6], NULL, NULL)) - { - return retval; - } + } + else { + /* was BKE_boundbox_ray_hit_check, see: cf6ca226fa58 */ + if (!isect_ray_aabb_v3_simple( + ray_start_local, ray_normal_local, bb->vec[0], bb->vec[6], NULL, NULL)) + { + return retval; } - /* was local_depth, see: T47838 */ - len_diff = dist_aabb_to_plane(bb->vec[0], bb->vec[6], ray_start_local, ray_normal_local); - if (len_diff < 0) len_diff = 0.0f; - need_ray_start_correction_init = false; } + /* was local_depth, see: T47838 */ + len_diff = dist_aabb_to_plane(bb->vec[0], bb->vec[6], ray_start_local, ray_normal_local); + if (len_diff < 0) len_diff = 0.0f; + need_ray_start_correction_init = false; } SnapObjectData_Mesh *sod = NULL; @@ -1182,6 +1175,29 @@ static bool snapDerivedMesh( if (treedata->cached && !bvhcache_has_tree(dm->bvhCache, treedata->tree)) { free_bvhtree_from_mesh(treedata); 
} + else { + if (!treedata->vert_allocated) { + treedata->vert = DM_get_vert_array(dm, &treedata->vert_allocated); + } + if ((tree_index == 1) && !treedata->edge_allocated) { + treedata->edge = DM_get_edge_array(dm, &treedata->vert_allocated); + } + if (tree_index == 2) { + if (!treedata->loop_allocated) { + treedata->loop = DM_get_loop_array(dm, &treedata->loop_allocated); + } + if (!treedata->looptri_allocated) { + if (!sod->poly_allocated) { + sod->mpoly = DM_get_poly_array(dm, &sod->poly_allocated); + } + treedata->looptri = DM_get_looptri_array( + dm, treedata->vert, + sod->mpoly, dm->getNumPolys(dm), + treedata->loop, dm->getNumLoops(dm), + &treedata->looptri_allocated); + } + } + } } } @@ -1295,10 +1311,17 @@ static bool snapDerivedMesh( } /* SCE_SNAP_MODE_VERTEX or SCE_SNAP_MODE_EDGE */ else { + + /* Warning: the depth_max is currently being used only in perspective view. + * It is not correct to limit the maximum depth for elements obtained with nearest + * since this limitation depends on the normal and the size of the occlusion face. + * And more... ray_depth is being confused with Z-depth here... 
(varies only the precision) */ + const float ray_depth_max_global = *ray_depth + snapdata->depth_range[0]; + Nearest2dUserData neasrest2d = { .dist_px_sq = SQUARE(*dist_px), .r_axis_closest = {1.0f, 1.0f, 1.0f}, - .depth_range = {snapdata->depth_range[0], *ray_depth + snapdata->depth_range[0]}, + .depth_range = {snapdata->depth_range[0], ray_depth_max_global}, .userdata = treedata, .get_edge_verts = (Nearest2DGetEdgeVertsCallback)get_dm_edge_verts, .copy_vert_no = (Nearest2DCopyVertNoCallback)copy_dm_vert_no, @@ -1650,7 +1673,6 @@ static bool snapObject( } retval = snapDerivedMesh( sctx, snapdata, ob, dm, obmat, ob_index, - true, ray_depth, dist_px, r_loc, r_no, r_index, r_hit_list); @@ -1858,6 +1880,9 @@ static void snap_object_data_free(void *sod_v) free_bvhtree_from_mesh(sod->bvh_trees[i]); } } + if (sod->poly_allocated) { + MEM_freeN(sod->mpoly); + } break; } case SNAP_EDIT_MESH: diff --git a/source/blender/gpu/CMakeLists.txt b/source/blender/gpu/CMakeLists.txt index 8885209ce01..885ff2ff159 100644 --- a/source/blender/gpu/CMakeLists.txt +++ b/source/blender/gpu/CMakeLists.txt @@ -57,6 +57,8 @@ set(SRC intern/gpu_init_exit.c intern/gpu_material.c intern/gpu_select.c + intern/gpu_select_pick.c + intern/gpu_select_sample_query.c intern/gpu_shader.c intern/gpu_texture.c @@ -97,6 +99,7 @@ set(SRC GPU_texture.h intern/gpu_codegen.h intern/gpu_private.h + intern/gpu_select_private.h ) data_to_c_simple(shaders/gpu_shader_geometry.glsl SRC) diff --git a/source/blender/gpu/GPU_select.h b/source/blender/gpu/GPU_select.h index 6a16b5b7456..cf5b8bf7d8f 100644 --- a/source/blender/gpu/GPU_select.h +++ b/source/blender/gpu/GPU_select.h @@ -30,19 +30,30 @@ #ifndef __GPU_SELECT_H__ #define __GPU_SELECT_H__ -#include "DNA_vec_types.h" /* rcft */ #include "BLI_sys_types.h" +struct rcti; + /* flags for mode of operation */ enum { GPU_SELECT_ALL = 1, + /* gpu_select_query */ GPU_SELECT_NEAREST_FIRST_PASS = 2, GPU_SELECT_NEAREST_SECOND_PASS = 3, + /* gpu_select_pick */ + 
GPU_SELECT_PICK_ALL = 4, + GPU_SELECT_PICK_NEAREST = 5, }; -void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, rctf *input, char mode, int oldhits); +void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const struct rcti *input, char mode, int oldhits); bool GPU_select_load_id(unsigned int id); unsigned int GPU_select_end(void); bool GPU_select_query_check_active(void); +/* cache selection region */ +bool GPU_select_is_cached(void); +void GPU_select_cache_begin(void); +void GPU_select_cache_load_id(void); +void GPU_select_cache_end(void); + #endif diff --git a/source/blender/gpu/intern/gpu_select.c b/source/blender/gpu/intern/gpu_select.c index 58582232cd5..9496ff137dc 100644 --- a/source/blender/gpu/intern/gpu_select.c +++ b/source/blender/gpu/intern/gpu_select.c @@ -29,109 +29,86 @@ * Interface for accessing gpu-related methods for selection. The semantics will be * similar to glRenderMode(GL_SELECT) since the goal is to maintain compatibility. */ +#include <stdlib.h> + #include "GPU_select.h" #include "GPU_extensions.h" #include "GPU_glew.h" - + #include "MEM_guardedalloc.h" #include "DNA_userdef_types.h" #include "BLI_utildefines.h" -/* Ad hoc number of queries to allocate to skip doing many glGenQueries */ -#define ALLOC_QUERIES 200 - -typedef struct GPUQueryState { +#include "gpu_select_private.h" + +/* Internal algorithm used */ +enum { + /** GL_SELECT, legacy OpenGL selection */ + ALGO_GL_LEGACY = 1, + /** glBegin/EndQuery(GL_SAMPLES_PASSED... ), `gpu_select_query.c` + * Only sets 4th component (ID) correctly. */ + ALGO_GL_QUERY = 2, + /** Read depth buffer for every drawing pass and extract depths, `gpu_select_pick.c` + * Only sets 4th component (ID) correctly. 
*/ + ALGO_GL_PICK = 3, +}; + +typedef struct GPUSelectState { /* To ignore selection id calls when not initialized */ bool select_is_active; - /* Tracks whether a query has been issued so that gpu_load_id can end the previous one */ - bool query_issued; - /* array holding the OpenGL query identifiers */ - unsigned int *queries; - /* array holding the id corresponding to each query */ - unsigned int *id; - /* number of queries in *queries and *id */ - unsigned int num_of_queries; - /* index to the next query to start */ - unsigned int active_query; /* flag to cache user preference for occlusion based selection */ bool use_gpu_select; - /* cache on initialization */ - unsigned int *buffer; - /* buffer size (stores number of integers, for actual size multiply by sizeof integer)*/ - unsigned int bufsize; /* mode of operation */ char mode; - unsigned int index; - int oldhits; -} GPUQueryState; + /* internal algorithm for selection */ + char algorithm; + /* allow GPU_select_begin/end without drawing */ + bool use_cache; +} GPUSelectState; -static GPUQueryState g_query_state = {0}; +static GPUSelectState g_select_state = {0}; /** * initialize and provide buffer for results */ -void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, rctf *input, char mode, int oldhits) +void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const rcti *input, char mode, int oldhits) { - g_query_state.select_is_active = true; - g_query_state.query_issued = false; - g_query_state.active_query = 0; - g_query_state.use_gpu_select = GPU_select_query_check_active(); - g_query_state.num_of_queries = 0; - g_query_state.bufsize = bufsize; - g_query_state.buffer = buffer; - g_query_state.mode = mode; - g_query_state.index = 0; - g_query_state.oldhits = oldhits; + g_select_state.select_is_active = true; + g_select_state.use_gpu_select = GPU_select_query_check_active(); + g_select_state.mode = mode; - if (!g_query_state.use_gpu_select) { - glSelectBuffer(bufsize, (GLuint 
*)buffer); - glRenderMode(GL_SELECT); - glInitNames(); - glPushName(-1); + if (ELEM(g_select_state.mode, GPU_SELECT_PICK_ALL, GPU_SELECT_PICK_NEAREST)) { + g_select_state.algorithm = ALGO_GL_PICK; + } + else if (!g_select_state.use_gpu_select) { + g_select_state.algorithm = ALGO_GL_LEGACY; } else { - float viewport[4]; - - g_query_state.num_of_queries = ALLOC_QUERIES; - - g_query_state.queries = MEM_mallocN(g_query_state.num_of_queries * sizeof(*g_query_state.queries), "gpu selection queries"); - g_query_state.id = MEM_mallocN(g_query_state.num_of_queries * sizeof(*g_query_state.id), "gpu selection ids"); - glGenQueries(g_query_state.num_of_queries, g_query_state.queries); - - glPushAttrib(GL_DEPTH_BUFFER_BIT | GL_VIEWPORT_BIT); - /* disable writing to the framebuffer */ - glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); - - /* In order to save some fill rate we minimize the viewport using rect. - * We need to get the region of the scissor so that our geometry doesn't - * get rejected before the depth test. 
Should probably cull rect against - * scissor for viewport but this is a rare case I think */ - glGetFloatv(GL_SCISSOR_BOX, viewport); - if (!input || input->xmin == input->xmax) { - glViewport(viewport[0], viewport[1], 24, 24); - } - else { - glViewport(viewport[0], viewport[1], (int)(input->xmax - input->xmin), (int)(input->ymax - input->ymin)); - } + g_select_state.algorithm = ALGO_GL_QUERY; + } - /* occlusion queries operates on fragments that pass tests and since we are interested on all - * objects in the view frustum independently of their order, we need to disable the depth test */ - if (mode == GPU_SELECT_ALL) { - glDisable(GL_DEPTH_TEST); - glDepthMask(GL_FALSE); + switch (g_select_state.algorithm) { + case ALGO_GL_LEGACY: + { + g_select_state.use_cache = false; + glSelectBuffer(bufsize, (GLuint *)buffer); + glRenderMode(GL_SELECT); + glInitNames(); + glPushName(-1); + break; } - else if (mode == GPU_SELECT_NEAREST_FIRST_PASS) { - glClear(GL_DEPTH_BUFFER_BIT); - glEnable(GL_DEPTH_TEST); - glDepthMask(GL_TRUE); - glDepthFunc(GL_LEQUAL); + case ALGO_GL_QUERY: + { + g_select_state.use_cache = false; + gpu_select_query_begin((unsigned int (*)[4])buffer, bufsize / 4, input, mode, oldhits); + break; } - else if (mode == GPU_SELECT_NEAREST_SECOND_PASS) { - glEnable(GL_DEPTH_TEST); - glDepthMask(GL_FALSE); - glDepthFunc(GL_EQUAL); + default: /* ALGO_GL_PICK */ + { + gpu_select_pick_begin((unsigned int (*)[4])buffer, bufsize / 4, input, mode); + break; } } } @@ -146,41 +123,24 @@ void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, rctf *input, c bool GPU_select_load_id(unsigned int id) { /* if no selection mode active, ignore */ - if (!g_query_state.select_is_active) + if (!g_select_state.select_is_active) return true; - if (!g_query_state.use_gpu_select) { - glLoadName(id); - } - else { - if (g_query_state.query_issued) { - glEndQuery(GL_SAMPLES_PASSED); + switch (g_select_state.algorithm) { + case ALGO_GL_LEGACY: + { + glLoadName(id); + return 
true; } - /* if required, allocate extra queries */ - if (g_query_state.active_query == g_query_state.num_of_queries) { - g_query_state.num_of_queries += ALLOC_QUERIES; - g_query_state.queries = MEM_reallocN(g_query_state.queries, g_query_state.num_of_queries * sizeof(*g_query_state.queries)); - g_query_state.id = MEM_reallocN(g_query_state.id, g_query_state.num_of_queries * sizeof(*g_query_state.id)); - glGenQueries(ALLOC_QUERIES, &g_query_state.queries[g_query_state.active_query]); + case ALGO_GL_QUERY: + { + return gpu_select_query_load_id(id); } - - glBeginQuery(GL_SAMPLES_PASSED, g_query_state.queries[g_query_state.active_query]); - g_query_state.id[g_query_state.active_query] = id; - g_query_state.active_query++; - g_query_state.query_issued = true; - - if (g_query_state.mode == GPU_SELECT_NEAREST_SECOND_PASS && g_query_state.index < g_query_state.oldhits) { - if (g_query_state.buffer[g_query_state.index * 4 + 3] == id) { - g_query_state.index++; - return true; - } - else { - return false; - } + default: /* ALGO_GL_PICK */ + { + return gpu_select_pick_load_id(id); } } - - return true; } /** @@ -191,59 +151,27 @@ bool GPU_select_load_id(unsigned int id) unsigned int GPU_select_end(void) { unsigned int hits = 0; - if (!g_query_state.use_gpu_select) { - glPopName(); - hits = glRenderMode(GL_RENDER); - } - else { - int i; - if (g_query_state.query_issued) { - glEndQuery(GL_SAMPLES_PASSED); + switch (g_select_state.algorithm) { + case ALGO_GL_LEGACY: + { + glPopName(); + hits = glRenderMode(GL_RENDER); + break; } - - for (i = 0; i < g_query_state.active_query; i++) { - unsigned int result; - glGetQueryObjectuiv(g_query_state.queries[i], GL_QUERY_RESULT, &result); - if (result > 0) { - if (g_query_state.mode != GPU_SELECT_NEAREST_SECOND_PASS) { - int maxhits = g_query_state.bufsize / 4; - - if (hits < maxhits) { - g_query_state.buffer[hits * 4] = 1; - g_query_state.buffer[hits * 4 + 1] = 0xFFFF; - g_query_state.buffer[hits * 4 + 2] = 0xFFFF; - 
g_query_state.buffer[hits * 4 + 3] = g_query_state.id[i]; - - hits++; - } - else { - hits = -1; - break; - } - } - else { - int j; - /* search in buffer and make selected object first */ - for (j = 0; j < g_query_state.oldhits; j++) { - if (g_query_state.buffer[j * 4 + 3] == g_query_state.id[i]) { - g_query_state.buffer[j * 4 + 1] = 0; - g_query_state.buffer[j * 4 + 2] = 0; - } - } - break; - } - } + case ALGO_GL_QUERY: + { + hits = gpu_select_query_end(); + break; + } + default: /* ALGO_GL_PICK */ + { + hits = gpu_select_pick_end(); + break; } - - glDeleteQueries(g_query_state.num_of_queries, g_query_state.queries); - MEM_freeN(g_query_state.queries); - MEM_freeN(g_query_state.id); - glPopAttrib(); - glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); } - g_query_state.select_is_active = false; + g_select_state.select_is_active = false; return hits; } @@ -260,3 +188,41 @@ bool GPU_select_query_check_active(void) GPU_type_matches(GPU_DEVICE_NVIDIA, GPU_OS_UNIX, GPU_DRIVER_OPENSOURCE)))); } + +/* ---------------------------------------------------------------------------- + * Caching + * + * Support multiple begin/end's as long as they are within the initial region. + * Currently only used by ALGO_GL_PICK. 
+ */ + +void GPU_select_cache_begin(void) +{ + /* validate on GPU_select_begin, clear if not supported */ + BLI_assert(g_select_state.use_cache == false); + g_select_state.use_cache = true; + if (g_select_state.algorithm == ALGO_GL_PICK) { + gpu_select_pick_cache_begin(); + } +} + +void GPU_select_cache_load_id(void) +{ + BLI_assert(g_select_state.use_cache == true); + if (g_select_state.algorithm == ALGO_GL_PICK) { + gpu_select_pick_cache_load_id(); + } +} + +void GPU_select_cache_end(void) +{ + if (g_select_state.algorithm == ALGO_GL_PICK) { + gpu_select_pick_cache_end(); + } + g_select_state.use_cache = false; +} + +bool GPU_select_is_cached(void) +{ + return g_select_state.use_cache && gpu_select_pick_is_cached(); +} diff --git a/source/blender/gpu/intern/gpu_select_pick.c b/source/blender/gpu/intern/gpu_select_pick.c new file mode 100644 index 00000000000..31f82fd002d --- /dev/null +++ b/source/blender/gpu/intern/gpu_select_pick.c @@ -0,0 +1,718 @@ +/* + * ***** BEGIN GPL LICENSE BLOCK ***** + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * The Original Code is Copyright (C) 2017 Blender Foundation. + * All rights reserved. 
+ * + * ***** END GPL LICENSE BLOCK ***** + */ + +/** \file blender/gpu/intern/gpu_select_pick.c + * \ingroup gpu + * + * Custom select code for picking small regions (not efficient for large regions). + * `gpu_select_pick_*` API. + */ +#include <string.h> +#include <stdlib.h> +#include <float.h> + +#include "GPU_select.h" +#include "GPU_extensions.h" +#include "GPU_glew.h" + +#include "MEM_guardedalloc.h" + +#include "BLI_rect.h" +#include "BLI_listbase.h" +#include "BLI_math_vector.h" +#include "BLI_utildefines.h" + +#include "gpu_select_private.h" + +/* #define DEBUG_PRINT */ + +/* Alloc number for depths */ +#define ALLOC_DEPTHS 200 + +/* Z-depth of cleared depth buffer */ +#define DEPTH_MAX 0xffffffff + +/* ---------------------------------------------------------------------------- + * SubRectStride + */ + +/* For looping over a sub-region of a rect, could be moved into 'rct.c'*/ +typedef struct SubRectStride { + unsigned int start; /* start here */ + unsigned int span; /* read these */ + unsigned int span_len; /* len times (read span 'len' times). */ + unsigned int skip; /* skip those */ +} SubRectStride; + +/* we may want to change back to float if uint isn't well supported */ +typedef unsigned int depth_t; + +/** + * Calculate values needed for looping over a sub-region (smaller buffer within a larger buffer). + * + * 'src' must be bigger than 'dst'. 
+ */ +static void rect_subregion_stride_calc(const rcti *src, const rcti *dst, SubRectStride *r_sub) +{ + const int src_x = BLI_rcti_size_x(src); + // const int src_y = BLI_rcti_size_y(src); + const int dst_x = BLI_rcti_size_x(dst); + const int dst_y = BLI_rcti_size_y(dst); + const int x = dst->xmin - src->xmin; + const int y = dst->ymin - src->ymin; + + BLI_assert(src->xmin <= dst->xmin && src->ymin <= dst->ymin && + src->ymax >= dst->ymax && src->ymax >= dst->ymax); + BLI_assert(x >= 0 && y >= 0); + + r_sub->start = (src_x * y) + x; + r_sub->span = dst_x; + r_sub->span_len = dst_y; + r_sub->skip = src_x - dst_x; +} + +/* ---------------------------------------------------------------------------- + * DepthBufCache + * + * Result of reading glReadPixels, + * use for both cache and non-cached storage. + */ + +/* store result of glReadPixels */ +typedef struct DepthBufCache { + struct DepthBufCache *next, *prev; + unsigned int id; + depth_t buf[0]; +} DepthBufCache; + +static DepthBufCache *depth_buf_malloc(unsigned int rect_len) +{ + DepthBufCache *rect = MEM_mallocN(sizeof(DepthBufCache) + sizeof(depth_t) * rect_len, __func__); + rect->id = SELECT_ID_NONE; + return rect; +} + +static bool depth_buf_rect_depth_any( + const DepthBufCache *rect_depth, + unsigned int rect_len) +{ + const depth_t *curr = rect_depth->buf; + for (unsigned int i = 0; i < rect_len; i++, curr++) { + if (*curr != DEPTH_MAX) { + return true; + } + } + return false; +} + +static bool depth_buf_subrect_depth_any( + const DepthBufCache *rect_depth, + const SubRectStride *sub_rect) +{ + const depth_t *curr = rect_depth->buf + sub_rect->start; + for (unsigned int i = 0; i < sub_rect->span_len; i++) { + const depth_t *curr_end = curr + sub_rect->span; + for (; curr < curr_end; curr++, curr++) { + if (*curr != DEPTH_MAX) { + return true; + } + } + curr += sub_rect->skip; + } + return false; +} + +static bool depth_buf_rect_not_equal( + const DepthBufCache *rect_depth_a, const DepthBufCache 
*rect_depth_b, + unsigned int rect_len) +{ + return memcmp(rect_depth_a->buf, rect_depth_b->buf, rect_len * sizeof(depth_t)) != 0; +} + +/** + * Both buffers are the same size, just check if the sub-rect contains any differences. + */ +static bool depth_buf_subrect_not_equal( + const DepthBufCache *rect_src, const DepthBufCache *rect_dst, + const SubRectStride *sub_rect) +{ + /* same as above but different rect sizes */ + const depth_t *prev = rect_src->buf + sub_rect->start; + const depth_t *curr = rect_dst->buf + sub_rect->start; + for (unsigned int i = 0; i < sub_rect->span_len; i++) { + const depth_t *curr_end = curr + sub_rect->span; + for (; curr < curr_end; prev++, curr++) { + if (*prev != *curr) { + return true; + } + } + prev += sub_rect->skip; + curr += sub_rect->skip; + } + return false; +} + +/* ---------------------------------------------------------------------------- + * DepthID + * + * Internal structure for storing hits. + */ + +typedef struct DepthID { + unsigned int id; + depth_t depth; +} DepthID; + +static int depth_id_cmp(const void *v1, const void *v2) +{ + const DepthID *d1 = v1, *d2 = v2; + if (d1->id < d2->id) { + return -1; + } + else if (d1->id > d2->id) { + return 1; + } + else { + return 0; + } +} + +static int depth_cmp(const void *v1, const void *v2) +{ + const DepthID *d1 = v1, *d2 = v2; + if (d1->depth < d2->depth) { + return -1; + } + else if (d1->depth > d2->depth) { + return 1; + } + else { + return 0; + } +} + +/* depth sorting */ +typedef struct GPUPickState { + /* cache on initialization */ + unsigned int (*buffer)[4]; + + /* buffer size (stores number of integers, for actual size multiply by sizeof integer)*/ + unsigned int bufsize; + /* mode of operation */ + char mode; + + /* OpenGL drawing, never use when (is_cached == true). 
*/ + struct { + /* The current depth, accumulated as we draw */ + DepthBufCache *rect_depth; + /* Scratch buffer, avoid allocs every time (when not caching) */ + DepthBufCache *rect_depth_test; + + /* Pass to glReadPixels (x, y, w, h) */ + int clip_readpixels[4]; + + /* Set after first draw */ + bool is_init; + unsigned int prev_id; + } gl; + + /* src: data stored in 'cache' and 'gl', + * dst: use when cached region is smaller (where src -> dst isn't 1:1) */ + struct { + rcti clip_rect; + unsigned int rect_len; + } src, dst; + + /* Store cache between `GPU_select_cache_begin/end` */ + bool use_cache; + bool is_cached; + struct { + /* Cleanup used for iterating over both source and destination buffers: + * src.clip_rect -> dst.clip_rect */ + SubRectStride sub_rect; + + /* List of DepthBufCache, sized of 'src.clip_rect' */ + ListBase bufs; + } cache; + + /* Pickign methods */ + union { + /* GPU_SELECT_PICK_ALL */ + struct { + DepthID *hits; + unsigned int hits_len; + unsigned int hits_len_alloc; + } all; + + /* GPU_SELECT_PICK_NEAREST */ + struct { + unsigned int *rect_id; + } nearest; + }; +} GPUPickState; + + +static GPUPickState g_pick_state = {0}; + +void gpu_select_pick_begin( + unsigned int (*buffer)[4], unsigned int bufsize, + const rcti *input, char mode) +{ + GPUPickState *ps = &g_pick_state; + +#ifdef DEBUG_PRINT + printf("%s: mode=%d, use_cache=%d, is_cache=%d\n", __func__, mode, ps->use_cache, ps->is_cached); +#endif + + ps->bufsize = bufsize; + ps->buffer = buffer; + ps->mode = mode; + + const unsigned int rect_len = BLI_rcti_size_x(input) * BLI_rcti_size_y(input); + ps->dst.clip_rect = *input; + ps->dst.rect_len = rect_len; + + /* Restrict OpenGL operations for when we don't have cache */ + if (ps->is_cached == false) { + + glPushAttrib(GL_DEPTH_BUFFER_BIT | GL_VIEWPORT_BIT); + /* disable writing to the framebuffer */ + glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); + + glEnable(GL_DEPTH_TEST); + glDepthMask(GL_TRUE); + + if (mode == 
GPU_SELECT_PICK_ALL) { + glDepthFunc(GL_ALWAYS); + } + else { + glDepthFunc(GL_LEQUAL); + } + + glPixelTransferi(GL_DEPTH_BIAS, 0.0); + glPixelTransferi(GL_DEPTH_SCALE, 1.0); + + + float viewport[4]; + glGetFloatv(GL_SCISSOR_BOX, viewport); + + ps->src.clip_rect = *input; + ps->src.rect_len = rect_len; + + ps->gl.clip_readpixels[0] = viewport[0]; + ps->gl.clip_readpixels[1] = viewport[1]; + ps->gl.clip_readpixels[2] = BLI_rcti_size_x(&ps->src.clip_rect); + ps->gl.clip_readpixels[3] = BLI_rcti_size_y(&ps->src.clip_rect); + + glViewport(UNPACK4(ps->gl.clip_readpixels)); + + /* It's possible we don't want to clear depth buffer, + * so existing elements are masked by current z-buffer. */ + glClear(GL_DEPTH_BUFFER_BIT); + + /* scratch buffer (read new values here) */ + ps->gl.rect_depth_test = depth_buf_malloc(rect_len); + ps->gl.rect_depth = depth_buf_malloc(rect_len); + + /* set initial 'far' value */ +#if 0 + glReadPixels(UNPACK4(ps->gl.clip_readpixels), GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, ps->gl.rect_depth->buf); +#else + for (unsigned int i = 0; i < rect_len; i++) { + ps->gl.rect_depth->buf[i] = DEPTH_MAX; + } +#endif + + ps->gl.is_init = false; + ps->gl.prev_id = 0; + } + else { + /* Using cache (ps->is_cached == true) */ + /* src.clip_rect -> dst.clip_rect */ + rect_subregion_stride_calc(&ps->src.clip_rect, &ps->dst.clip_rect, &ps->cache.sub_rect); + BLI_assert(ps->gl.rect_depth == NULL); + BLI_assert(ps->gl.rect_depth_test == NULL); + } + + if (mode == GPU_SELECT_PICK_ALL) { + ps->all.hits = MEM_mallocN(sizeof(*ps->all.hits) * ALLOC_DEPTHS, __func__); + ps->all.hits_len = 0; + ps->all.hits_len_alloc = ALLOC_DEPTHS; + } + else { + /* Set to 0xff for SELECT_ID_NONE */ + ps->nearest.rect_id = MEM_mallocN(sizeof(unsigned int) * ps->dst.rect_len, __func__); + memset(ps->nearest.rect_id, 0xff, sizeof(unsigned int) * ps->dst.rect_len); + } +} + +/** + * Given 2x depths, we know are different - update the depth information + * use for both cached/uncached depth 
buffers. + */ +static void gpu_select_load_id_pass_all(const DepthBufCache *rect_curr) +{ + GPUPickState *ps = &g_pick_state; + const unsigned int id = rect_curr->id; + /* find the best depth for this pass and store in 'all.hits' */ + depth_t depth_best = DEPTH_MAX; + +#define EVAL_TEST() \ + if (depth_best > *curr) { \ + depth_best = *curr; \ + } ((void)0) + + if (ps->is_cached == false) { + const depth_t *curr = rect_curr->buf; + BLI_assert(ps->src.rect_len == ps->dst.rect_len); + const unsigned int rect_len = ps->src.rect_len; + for (unsigned int i = 0; i < rect_len; i++, curr++) { + EVAL_TEST(); + } + } + else { + /* same as above but different rect sizes */ + const depth_t *curr = rect_curr->buf + ps->cache.sub_rect.start; + for (unsigned int i = 0; i < ps->cache.sub_rect.span_len; i++) { + const depth_t *curr_end = curr + ps->cache.sub_rect.span; + for (; curr < curr_end; curr++) { + EVAL_TEST(); + } + curr += ps->cache.sub_rect.skip; + } + } + +#undef EVAL_TEST + + /* ensure enough space */ + if (UNLIKELY(ps->all.hits_len == ps->all.hits_len_alloc)) { + ps->all.hits_len_alloc += ALLOC_DEPTHS; + ps->all.hits = MEM_reallocN(ps->all.hits, ps->all.hits_len_alloc * sizeof(*ps->all.hits)); + } + DepthID *d = &ps->all.hits[ps->all.hits_len++]; + d->id = id; + d->depth = depth_best; +} + +static void gpu_select_load_id_pass_nearest(const DepthBufCache *rect_prev, const DepthBufCache *rect_curr) +{ + GPUPickState *ps = &g_pick_state; + const unsigned int id = rect_curr->id; + /* keep track each pixels ID in 'nearest.rect_id' */ + if (id != SELECT_ID_NONE) { + unsigned int *id_ptr = ps->nearest.rect_id; + +#define EVAL_TEST() \ + if (*curr != *prev) { \ + *id_ptr = id; \ + } ((void)0) + + if (ps->is_cached == false) { + const depth_t *prev = rect_prev->buf; + const depth_t *curr = rect_curr->buf; + BLI_assert(ps->src.rect_len == ps->dst.rect_len); + const unsigned int rect_len = ps->src.rect_len; + for (unsigned int i = 0; i < rect_len; i++, curr++, prev++, id_ptr++) 
{ + EVAL_TEST(); + } + } + else { + /* same as above but different rect sizes */ + const depth_t *prev = rect_prev->buf + ps->cache.sub_rect.start; + const depth_t *curr = rect_curr->buf + ps->cache.sub_rect.start; + for (unsigned int i = 0; i < ps->cache.sub_rect.span_len; i++) { + const depth_t *curr_end = curr + ps->cache.sub_rect.span; + for (; curr < curr_end; prev++, curr++, id_ptr++) { + EVAL_TEST(); + } + prev += ps->cache.sub_rect.skip; + curr += ps->cache.sub_rect.skip; + } + } + +#undef EVAL_TEST + } +} + + +bool gpu_select_pick_load_id(unsigned int id) +{ + GPUPickState *ps = &g_pick_state; + if (ps->gl.is_init) { + const unsigned int rect_len = ps->src.rect_len; + glReadPixels(UNPACK4(ps->gl.clip_readpixels), GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, ps->gl.rect_depth_test->buf); + /* perform initial check since most cases the array remains unchanged */ + + bool do_pass = false; + if (g_pick_state.mode == GPU_SELECT_PICK_ALL) { + if (depth_buf_rect_depth_any(ps->gl.rect_depth_test, rect_len)) { + ps->gl.rect_depth_test->id = ps->gl.prev_id; + gpu_select_load_id_pass_all(ps->gl.rect_depth_test); + do_pass = true; + } + } + else { + if (depth_buf_rect_not_equal(ps->gl.rect_depth, ps->gl.rect_depth_test, rect_len)) { + ps->gl.rect_depth_test->id = ps->gl.prev_id; + gpu_select_load_id_pass_nearest(ps->gl.rect_depth, ps->gl.rect_depth_test); + do_pass = true; + } + } + + if (do_pass) { + /* Store depth in cache */ + if (ps->use_cache) { + BLI_addtail(&ps->cache.bufs, ps->gl.rect_depth); + ps->gl.rect_depth = depth_buf_malloc(ps->src.rect_len); + } + + SWAP(DepthBufCache *, ps->gl.rect_depth, ps->gl.rect_depth_test); + + if (g_pick_state.mode == GPU_SELECT_PICK_ALL) { + /* we want new depths every time */ + glClear(GL_DEPTH_BUFFER_BIT); + } + } + } + + ps->gl.is_init = true; + ps->gl.prev_id = id; + + return true; +} + +unsigned int gpu_select_pick_end(void) +{ + GPUPickState *ps = &g_pick_state; + +#ifdef DEBUG_PRINT + printf("%s\n", __func__); +#endif + + if 
(ps->is_cached == false) { + if (ps->gl.is_init) { + /* force finishing last pass */ + gpu_select_pick_load_id(ps->gl.prev_id); + } + + glPopAttrib(); + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + } + + /* assign but never free directly since it may be in cache */ + DepthBufCache *rect_depth_final; + + /* Store depth in cache */ + if (ps->use_cache && !ps->is_cached) { + BLI_addtail(&ps->cache.bufs, ps->gl.rect_depth); + ps->gl.rect_depth = NULL; + rect_depth_final = ps->cache.bufs.last; + } + else if (ps->is_cached) { + rect_depth_final = ps->cache.bufs.last; + } + else { + /* common case, no cache */ + rect_depth_final = ps->gl.rect_depth; + } + + unsigned int maxhits = g_pick_state.bufsize; + DepthID *depth_data; + unsigned int depth_data_len = 0; + + if (g_pick_state.mode == GPU_SELECT_PICK_ALL) { + depth_data = ps->all.hits; + depth_data_len = ps->all.hits_len; + /* move ownership */ + ps->all.hits = NULL; + ps->all.hits_len = 0; + ps->all.hits_len_alloc = 0; + } + else { + /* GPU_SELECT_PICK_NEAREST */ + + /* Over alloc (unlikely we have as many depths as pixels) */ + unsigned int depth_data_len_first_pass = 0; + depth_data = MEM_mallocN(ps->dst.rect_len * sizeof(*depth_data), __func__); + + /* Partially de-duplicating copy, + * when contiguous ID's are found - update their closest depth. + * This isn't essential but means there is less data to sort. 
*/ + +#define EVAL_TEST(i_src, i_dst) \ + { \ + const unsigned int id = ps->nearest.rect_id[i_dst]; \ + if (id != SELECT_ID_NONE) { \ + const depth_t depth = rect_depth_final->buf[i_src]; \ + if (depth_last == NULL || depth_last->id != id) { \ + DepthID *d = &depth_data[depth_data_len_first_pass++]; \ + d->id = id; \ + d->depth = depth; \ + } \ + else if (depth_last->depth > depth) { \ + depth_last->depth = depth; \ + } \ + } \ + } ((void)0) + + { + DepthID *depth_last = NULL; + if (ps->is_cached == false) { + for (unsigned int i = 0; i < ps->src.rect_len; i++) { + EVAL_TEST(i, i); + } + } + else { + /* same as above but different rect sizes */ + unsigned int i_src = ps->cache.sub_rect.start, i_dst = 0; + for (unsigned int j = 0; j < ps->cache.sub_rect.span_len; j++) { + const unsigned int i_src_end = i_src + ps->cache.sub_rect.span; + for (; i_src < i_src_end; i_src++, i_dst++) { + EVAL_TEST(i_src, i_dst); + } + i_src += ps->cache.sub_rect.skip; + } + } + } + +#undef EVAL_TEST + + qsort(depth_data, depth_data_len_first_pass, sizeof(DepthID), depth_id_cmp); + + /* Sort by ID's then keep the best depth for each ID */ + depth_data_len = 0; + { + DepthID *depth_last = NULL; + for (unsigned int i = 0; i < depth_data_len_first_pass; i++) { + if (depth_last == NULL || depth_last->id != depth_data[i].id) { + depth_last = &depth_data[depth_data_len++]; + *depth_last = depth_data[i]; + } + else if (depth_last->depth > depth_data[i].depth) { + depth_last->depth = depth_data[i].depth; + } + } + } + } + + /* Finally sort each unique (id, depth) pair by depth + * so the final hit-list is sorted by depth (nearest first) */ + unsigned int hits = 0; + + if (depth_data_len > maxhits) { + hits = -1; + } + else { + qsort(depth_data, depth_data_len, sizeof(DepthID), depth_cmp); + + for (unsigned int i = 0; i < depth_data_len; i++) { +#ifdef DEBUG_PRINT + printf(" hit: %d: depth %u\n", depth_data[i].id, depth_data[i].depth); +#endif + /* first 3 are dummy values */ + 
g_pick_state.buffer[hits][0] = 1; + g_pick_state.buffer[hits][1] = 0x0; + g_pick_state.buffer[hits][2] = 0x0; + g_pick_state.buffer[hits][3] = depth_data[i].id; + hits++; + } + BLI_assert(hits < maxhits); + } + + MEM_freeN(depth_data); + + MEM_SAFE_FREE(ps->gl.rect_depth); + MEM_SAFE_FREE(ps->gl.rect_depth_test); + + if (g_pick_state.mode == GPU_SELECT_PICK_ALL) { + /* 'hits' already freed as 'depth_data' */ + } + else { + MEM_freeN(ps->nearest.rect_id); + ps->nearest.rect_id = NULL; + } + + if (ps->use_cache) { + ps->is_cached = true; + } + + return hits; +} + +/* ---------------------------------------------------------------------------- + * Caching + * + * Support multiple begin/end's reusing depth buffers. + */ + +void gpu_select_pick_cache_begin(void) +{ + BLI_assert(g_pick_state.use_cache == false); +#ifdef DEBUG_PRINT + printf("%s\n", __func__); +#endif + g_pick_state.use_cache = true; + g_pick_state.is_cached = false; +} + +void gpu_select_pick_cache_end(void) +{ +#ifdef DEBUG_PRINT + printf("%s: with %d buffers\n", __func__, BLI_listbase_count(&g_pick_state.cache.bufs)); +#endif + g_pick_state.use_cache = false; + g_pick_state.is_cached = false; + + BLI_freelistN(&g_pick_state.cache.bufs); +} + +/* is drawing needed? */ +bool gpu_select_pick_is_cached(void) +{ + return g_pick_state.is_cached; +} + +void gpu_select_pick_cache_load_id(void) +{ + BLI_assert(g_pick_state.is_cached == true); + GPUPickState *ps = &g_pick_state; +#ifdef DEBUG_PRINT + printf("%s (building depth from cache)\n", __func__); +#endif + for (DepthBufCache *rect_depth = ps->cache.bufs.first; rect_depth; rect_depth = rect_depth->next) { + if (rect_depth->next != NULL) { + /* we know the buffers differ, but this sub-region may not. 
+ * double check before adding an id-pass */ + if (g_pick_state.mode == GPU_SELECT_PICK_ALL) { + if (depth_buf_subrect_depth_any(rect_depth->next, &ps->cache.sub_rect)) { + gpu_select_load_id_pass_all(rect_depth->next); + } + } + else { + if (depth_buf_subrect_not_equal(rect_depth, rect_depth->next, &ps->cache.sub_rect)) { + gpu_select_load_id_pass_nearest(rect_depth, rect_depth->next); + } + } + } + } +} diff --git a/source/blender/gpu/intern/gpu_select_private.h b/source/blender/gpu/intern/gpu_select_private.h new file mode 100644 index 00000000000..631b8806af9 --- /dev/null +++ b/source/blender/gpu/intern/gpu_select_private.h @@ -0,0 +1,48 @@ +/* + * ***** BEGIN GPL LICENSE BLOCK ***** + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * The Original Code is Copyright (C) 2014 Blender Foundation. + * All rights reserved. + * + * Contributor(s): Antony Riakiotakis. + * + * ***** END GPL LICENSE BLOCK ***** + */ + +/** \file blender/gpu/intern/gpu_select_private.h + * \ingroup gpu + * + * Selection implementations. 
+ */ + +/* gpu_select_pick */ +void gpu_select_pick_begin(unsigned int (*buffer)[4], unsigned int bufsize, const rcti *input, char mode); +bool gpu_select_pick_load_id(unsigned int id); +unsigned int gpu_select_pick_end(void); + +void gpu_select_pick_cache_begin(void); +void gpu_select_pick_cache_end(void); +bool gpu_select_pick_is_cached(void); +void gpu_select_pick_cache_load_id(void); + +/* gpu_select_sample_query */ +void gpu_select_query_begin(unsigned int (*buffer)[4], unsigned int bufsize, const rcti *input, char mode, int oldhits); +bool gpu_select_query_load_id(unsigned int id); +unsigned int gpu_select_query_end(void); + + +#define SELECT_ID_NONE ((unsigned int)0xffffffff) diff --git a/source/blender/gpu/intern/gpu_select_sample_query.c b/source/blender/gpu/intern/gpu_select_sample_query.c new file mode 100644 index 00000000000..5576367edd9 --- /dev/null +++ b/source/blender/gpu/intern/gpu_select_sample_query.c @@ -0,0 +1,209 @@ +/* + * ***** BEGIN GPL LICENSE BLOCK ***** + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * The Original Code is Copyright (C) 2014 Blender Foundation. + * All rights reserved. + * + * Contributor(s): Antony Riakiotakis. 
+ * + * ***** END GPL LICENSE BLOCK ***** + */ + +/** \file blender/gpu/intern/gpu_select.c + * \ingroup gpu + * + * Interface for accessing gpu-related methods for selection. The semantics will be + * similar to glRenderMode(GL_SELECT) since the goal is to maintain compatibility. + */ + +#include <stdlib.h> + +#include "GPU_select.h" +#include "GPU_extensions.h" +#include "GPU_glew.h" + +#include "MEM_guardedalloc.h" + +#include "BLI_rect.h" + +#include "BLI_utildefines.h" + +#include "gpu_select_private.h" + + +/* Ad hoc number of queries to allocate to skip doing many glGenQueries */ +#define ALLOC_QUERIES 200 + +typedef struct GPUQueryState { + /* Tracks whether a query has been issued so that gpu_load_id can end the previous one */ + bool query_issued; + /* array holding the OpenGL query identifiers */ + unsigned int *queries; + /* array holding the id corresponding to each query */ + unsigned int *id; + /* number of queries in *queries and *id */ + unsigned int num_of_queries; + /* index to the next query to start */ + unsigned int active_query; + /* cache on initialization */ + unsigned int (*buffer)[4]; + /* buffer size (stores number of integers, for actual size multiply by sizeof integer)*/ + unsigned int bufsize; + /* mode of operation */ + char mode; + unsigned int index; + int oldhits; +} GPUQueryState; + +static GPUQueryState g_query_state = {0}; + + +void gpu_select_query_begin( + unsigned int (*buffer)[4], unsigned int bufsize, + const rcti *input, char mode, + int oldhits) +{ + float viewport[4]; + + g_query_state.query_issued = false; + g_query_state.active_query = 0; + g_query_state.num_of_queries = 0; + g_query_state.bufsize = bufsize; + g_query_state.buffer = buffer; + g_query_state.mode = mode; + g_query_state.index = 0; + g_query_state.oldhits = oldhits; + + g_query_state.num_of_queries = ALLOC_QUERIES; + + g_query_state.queries = MEM_mallocN(g_query_state.num_of_queries * sizeof(*g_query_state.queries), "gpu selection queries"); + 
g_query_state.id = MEM_mallocN(g_query_state.num_of_queries * sizeof(*g_query_state.id), "gpu selection ids"); + glGenQueries(g_query_state.num_of_queries, g_query_state.queries); + + glPushAttrib(GL_DEPTH_BUFFER_BIT | GL_VIEWPORT_BIT); + /* disable writing to the framebuffer */ + glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); + + /* In order to save some fill rate we minimize the viewport using rect. + * We need to get the region of the scissor so that our geometry doesn't + * get rejected before the depth test. Should probably cull rect against + * scissor for viewport but this is a rare case I think */ + glGetFloatv(GL_SCISSOR_BOX, viewport); + glViewport(viewport[0], viewport[1], BLI_rcti_size_x(input), BLI_rcti_size_y(input)); + + /* occlusion queries operates on fragments that pass tests and since we are interested on all + * objects in the view frustum independently of their order, we need to disable the depth test */ + if (mode == GPU_SELECT_ALL) { + glDisable(GL_DEPTH_TEST); + glDepthMask(GL_FALSE); + } + else if (mode == GPU_SELECT_NEAREST_FIRST_PASS) { + glClear(GL_DEPTH_BUFFER_BIT); + glEnable(GL_DEPTH_TEST); + glDepthMask(GL_TRUE); + glDepthFunc(GL_LEQUAL); + } + else if (mode == GPU_SELECT_NEAREST_SECOND_PASS) { + glEnable(GL_DEPTH_TEST); + glDepthMask(GL_FALSE); + glDepthFunc(GL_EQUAL); + } +} + +bool gpu_select_query_load_id(unsigned int id) +{ + if (g_query_state.query_issued) { + glEndQuery(GL_SAMPLES_PASSED); + } + /* if required, allocate extra queries */ + if (g_query_state.active_query == g_query_state.num_of_queries) { + g_query_state.num_of_queries += ALLOC_QUERIES; + g_query_state.queries = MEM_reallocN(g_query_state.queries, g_query_state.num_of_queries * sizeof(*g_query_state.queries)); + g_query_state.id = MEM_reallocN(g_query_state.id, g_query_state.num_of_queries * sizeof(*g_query_state.id)); + glGenQueries(ALLOC_QUERIES, &g_query_state.queries[g_query_state.active_query]); + } + + glBeginQuery(GL_SAMPLES_PASSED, 
g_query_state.queries[g_query_state.active_query]); + g_query_state.id[g_query_state.active_query] = id; + g_query_state.active_query++; + g_query_state.query_issued = true; + + if (g_query_state.mode == GPU_SELECT_NEAREST_SECOND_PASS && g_query_state.index < g_query_state.oldhits) { + if (g_query_state.buffer[g_query_state.index][3] == id) { + g_query_state.index++; + return true; + } + else { + return false; + } + } + + return true; +} + +unsigned int gpu_select_query_end(void) +{ + int i; + + unsigned int hits = 0; + const unsigned int maxhits = g_query_state.bufsize; + + if (g_query_state.query_issued) { + glEndQuery(GL_SAMPLES_PASSED); + } + + for (i = 0; i < g_query_state.active_query; i++) { + unsigned int result; + glGetQueryObjectuiv(g_query_state.queries[i], GL_QUERY_RESULT, &result); + if (result > 0) { + if (g_query_state.mode != GPU_SELECT_NEAREST_SECOND_PASS) { + + if (hits < maxhits) { + g_query_state.buffer[hits][0] = 1; + g_query_state.buffer[hits][1] = 0xFFFF; + g_query_state.buffer[hits][2] = 0xFFFF; + g_query_state.buffer[hits][3] = g_query_state.id[i]; + + hits++; + } + else { + hits = -1; + break; + } + } + else { + int j; + /* search in buffer and make selected object first */ + for (j = 0; j < g_query_state.oldhits; j++) { + if (g_query_state.buffer[j][3] == g_query_state.id[i]) { + g_query_state.buffer[j][1] = 0; + g_query_state.buffer[j][2] = 0; + } + } + break; + } + } + } + + glDeleteQueries(g_query_state.num_of_queries, g_query_state.queries); + MEM_freeN(g_query_state.queries); + MEM_freeN(g_query_state.id); + glPopAttrib(); + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + + return hits; +} diff --git a/source/blender/imbuf/intern/imbuf.h b/source/blender/imbuf/intern/imbuf.h index 897a149a45c..90dad70fa61 100644 --- a/source/blender/imbuf/intern/imbuf.h +++ b/source/blender/imbuf/intern/imbuf.h @@ -67,8 +67,6 @@ # define BIG_LONG SWAP_LONG #endif -typedef unsigned char uchar; - #define IMB_DPI_DEFAULT 72.0f #endif /* __IMBUF_H__ 
*/ diff --git a/source/blender/makesdna/DNA_modifier_types.h b/source/blender/makesdna/DNA_modifier_types.h index f95533a88f9..32b43c7ea55 100644 --- a/source/blender/makesdna/DNA_modifier_types.h +++ b/source/blender/makesdna/DNA_modifier_types.h @@ -86,6 +86,7 @@ typedef enum ModifierType { eModifierType_NormalEdit = 50, eModifierType_CorrectiveSmooth = 51, eModifierType_MeshSequenceCache = 52, + eModifierType_SurfaceDeform = 53, NUM_MODIFIER_TYPES } ModifierType; @@ -1570,6 +1571,46 @@ enum { MOD_MESHSEQ_READ_COLOR = (1 << 3), }; +typedef struct SDefBind { + unsigned int *vert_inds; + unsigned int numverts; + int mode; + float *vert_weights; + float normal_dist; + float influence; +} SDefBind; + +typedef struct SDefVert { + SDefBind *binds; + unsigned int numbinds; + char pad[4]; +} SDefVert; + +typedef struct SurfaceDeformModifierData { + ModifierData modifier; + + struct Object *target; /* bind target object */ + SDefVert *verts; /* vertex bind data */ + float falloff; + unsigned int numverts, numpoly; + int flags; + float mat[4][4]; +} SurfaceDeformModifierData; + +/* Surface Deform modifier flags */ +enum { + MOD_SDEF_BIND = (1 << 0), + MOD_SDEF_USES_LOOPTRI = (1 << 1), + MOD_SDEF_HAS_CONCAVE = (1 << 2), +}; + +/* Surface Deform vertex bind modes */ +enum { + MOD_SDEF_MODE_LOOPTRI = 0, + MOD_SDEF_MODE_NGON = 1, + MOD_SDEF_MODE_CENTROID = 2, +}; + #define MOD_MESHSEQ_READ_ALL \ (MOD_MESHSEQ_READ_VERT | MOD_MESHSEQ_READ_POLY | MOD_MESHSEQ_READ_UV | MOD_MESHSEQ_READ_COLOR) diff --git a/source/blender/makesdna/DNA_node_types.h b/source/blender/makesdna/DNA_node_types.h index fd601e55550..47677e50451 100644 --- a/source/blender/makesdna/DNA_node_types.h +++ b/source/blender/makesdna/DNA_node_types.h @@ -668,7 +668,8 @@ typedef struct NodeScriptDict { /* qdn: glare node */ typedef struct NodeGlare { char quality, type, iter; - char angle, pad_c1, size, pad[2]; + /* XXX angle is only kept for backward/forward compatibility, was used for two different things, see 
T50736. */ + char angle DNA_DEPRECATED, pad_c1, size, star_45, streaks; float colmod, mix, threshold, fade; float angle_ofs, pad_f1; } NodeGlare; diff --git a/source/blender/makesdna/DNA_object_force.h b/source/blender/makesdna/DNA_object_force.h index 59acefeffe4..ed14c4b9311 100644 --- a/source/blender/makesdna/DNA_object_force.h +++ b/source/blender/makesdna/DNA_object_force.h @@ -372,6 +372,7 @@ typedef struct SoftBody { #define PFIELD_DO_ROTATION (1<<15) #define PFIELD_GUIDE_PATH_WEIGHT (1<<16) /* apply curve weights */ #define PFIELD_SMOKE_DENSITY (1<<17) /* multiply smoke force by density */ +#define PFIELD_GRAVITATION (1<<18) /* used for (simple) force */ /* pd->falloff */ #define PFIELD_FALL_SPHERE 0 diff --git a/source/blender/makesdna/DNA_scene_types.h b/source/blender/makesdna/DNA_scene_types.h index 8ee15ef21a3..918d0f00040 100644 --- a/source/blender/makesdna/DNA_scene_types.h +++ b/source/blender/makesdna/DNA_scene_types.h @@ -1716,6 +1716,7 @@ typedef struct Scene { #define SCER_LOCK_FRAME_SELECTION (1<<1) /* timeline/keyframe jumping - only selected items (on by default) */ #define SCE_KEYS_NO_SELONLY (1<<2) +#define SCER_SHOW_SUBFRAME (1<<3) /* mode (int now) */ #define R_OSA 0x0001 diff --git a/source/blender/makesdna/DNA_userdef_types.h b/source/blender/makesdna/DNA_userdef_types.h index 94cc7dd9892..fc970c40c12 100644 --- a/source/blender/makesdna/DNA_userdef_types.h +++ b/source/blender/makesdna/DNA_userdef_types.h @@ -498,7 +498,6 @@ typedef struct UserDef { int prefetchframes; float pad_rot_angle; /* control the rotation step of the view when PAD2, PAD4, PAD6&PAD8 is use */ short frameserverport; - short pad4; short obcenter_dia; short rvisize; /* rotating view icon size */ short rvibright; /* rotating view icon brightness */ @@ -510,6 +509,8 @@ typedef struct UserDef { char ipo_new; /* interpolation mode for newly added F-Curves */ char keyhandles_new; /* handle types for newly added keyframes */ char gpu_select_method; + char 
gpu_select_pick_deph; + char pad4; char view_frame_type; int view_frame_keyframes; /* number of keyframes to zoom around current frame */ diff --git a/source/blender/makesrna/RNA_access.h b/source/blender/makesrna/RNA_access.h index 66e6f30feeb..f9aaec69ce7 100644 --- a/source/blender/makesrna/RNA_access.h +++ b/source/blender/makesrna/RNA_access.h @@ -598,6 +598,7 @@ extern StructRNA RNA_StucciTexture; extern StructRNA RNA_SubsurfModifier; extern StructRNA RNA_SunLamp; extern StructRNA RNA_SurfaceCurve; +extern StructRNA RNA_SurfaceDeformModifier; extern StructRNA RNA_SurfaceModifier; extern StructRNA RNA_TexMapping; extern StructRNA RNA_Text; diff --git a/source/blender/makesrna/intern/rna_define.c b/source/blender/makesrna/intern/rna_define.c index dc97d39052b..1d232d2df39 100644 --- a/source/blender/makesrna/intern/rna_define.c +++ b/source/blender/makesrna/intern/rna_define.c @@ -3157,8 +3157,9 @@ int rna_parameter_size(PropertyRNA *parm) StringPropertyRNA *sparm = (StringPropertyRNA *)parm; return sizeof(char) * sparm->maxlength; } - else + else { return sizeof(char *); + } case PROP_POINTER: { #ifdef RNA_RUNTIME diff --git a/source/blender/makesrna/intern/rna_mesh_api.c b/source/blender/makesrna/intern/rna_mesh_api.c index ff9873fb3d1..9b0a25560f9 100644 --- a/source/blender/makesrna/intern/rna_mesh_api.c +++ b/source/blender/makesrna/intern/rna_mesh_api.c @@ -209,6 +209,11 @@ static void rna_Mesh_flip_normals(Mesh *mesh) DAG_id_tag_update(&mesh->id, 0); } +static void rna_Mesh_split_faces(Mesh *mesh, int free_loop_normals) +{ + BKE_mesh_split_faces(mesh, free_loop_normals != 0); +} + #else void RNA_api_mesh(StructRNA *srna) @@ -240,8 +245,10 @@ void RNA_api_mesh(StructRNA *srna) func = RNA_def_function(srna, "free_normals_split", "rna_Mesh_free_normals_split"); RNA_def_function_ui_description(func, "Free split vertex normals"); - func = RNA_def_function(srna, "split_faces", "BKE_mesh_split_faces"); + func = RNA_def_function(srna, "split_faces", 
"rna_Mesh_split_faces"); RNA_def_function_ui_description(func, "Split faces based on the edge angle"); + RNA_def_boolean(func, "free_loop_normals", 1, "Free Loop Notmals", + "Free loop normals custom data layer"); func = RNA_def_function(srna, "calc_tangents", "rna_Mesh_calc_tangents"); RNA_def_function_flag(func, FUNC_USE_REPORTS); diff --git a/source/blender/makesrna/intern/rna_modifier.c b/source/blender/makesrna/intern/rna_modifier.c index c4f0db38a16..47c4b425155 100644 --- a/source/blender/makesrna/intern/rna_modifier.c +++ b/source/blender/makesrna/intern/rna_modifier.c @@ -105,6 +105,7 @@ EnumPropertyItem rna_enum_object_modifier_type_items[] = { {eModifierType_Shrinkwrap, "SHRINKWRAP", ICON_MOD_SHRINKWRAP, "Shrinkwrap", ""}, {eModifierType_SimpleDeform, "SIMPLE_DEFORM", ICON_MOD_SIMPLEDEFORM, "Simple Deform", ""}, {eModifierType_Smooth, "SMOOTH", ICON_MOD_SMOOTH, "Smooth", ""}, + {eModifierType_SurfaceDeform, "SURFACE_DEFORM", ICON_MOD_MESHDEFORM, "Surface Deform", ""}, {eModifierType_Warp, "WARP", ICON_MOD_WARP, "Warp", ""}, {eModifierType_Wave, "WAVE", ICON_MOD_WAVE, "Wave", ""}, {0, "", 0, N_("Simulate"), ""}, @@ -408,6 +409,8 @@ static StructRNA *rna_Modifier_refine(struct PointerRNA *ptr) return &RNA_CorrectiveSmoothModifier; case eModifierType_MeshSequenceCache: return &RNA_MeshSequenceCacheModifier; + case eModifierType_SurfaceDeform: + return &RNA_SurfaceDeformModifier; /* Default */ case eModifierType_None: case eModifierType_ShapeKey: @@ -573,6 +576,7 @@ RNA_MOD_OBJECT_SET(MeshDeform, object, OB_MESH); RNA_MOD_OBJECT_SET(NormalEdit, target, OB_EMPTY); RNA_MOD_OBJECT_SET(Shrinkwrap, target, OB_MESH); RNA_MOD_OBJECT_SET(Shrinkwrap, auxTarget, OB_MESH); +RNA_MOD_OBJECT_SET(SurfaceDeform, target, OB_MESH); static void rna_HookModifier_object_set(PointerRNA *ptr, PointerRNA value) { @@ -1131,6 +1135,11 @@ static int rna_CorrectiveSmoothModifier_is_bind_get(PointerRNA *ptr) return (csmd->bind_coords != NULL); } +static int 
rna_SurfaceDeformModifier_is_bound_get(PointerRNA *ptr) +{ + return (((SurfaceDeformModifierData *)ptr->data)->verts != NULL); +} + static void rna_MeshSequenceCache_object_path_update(Main *bmain, Scene *scene, PointerRNA *ptr) { #ifdef WITH_ALEMBIC @@ -4702,6 +4711,33 @@ static void rna_def_modifier_normaledit(BlenderRNA *brna) RNA_def_property_update(prop, 0, "rna_Modifier_update"); } +static void rna_def_modifier_surfacedeform(BlenderRNA *brna) +{ + StructRNA *srna; + PropertyRNA *prop; + + srna = RNA_def_struct(brna, "SurfaceDeformModifier", "Modifier"); + RNA_def_struct_ui_text(srna, "SurfaceDeform Modifier", "Deform the mesh by binding it to the surface of a target mesh"); + RNA_def_struct_sdna(srna, "SurfaceDeformModifierData"); + RNA_def_struct_ui_icon(srna, ICON_MOD_MESHDEFORM); + + prop = RNA_def_property(srna, "target", PROP_POINTER, PROP_NONE); + RNA_def_property_ui_text(prop, "Target", "Mesh object to deform with"); + RNA_def_property_pointer_funcs(prop, NULL, "rna_SurfaceDeformModifier_target_set", NULL, "rna_Mesh_object_poll"); + RNA_def_property_flag(prop, PROP_EDITABLE | PROP_ID_SELF_CHECK); + RNA_def_property_update(prop, 0, "rna_Modifier_dependency_update"); + + prop = RNA_def_property(srna, "falloff", PROP_FLOAT, PROP_NONE); + RNA_def_property_range(prop, 2.0f, 16.0f); + RNA_def_property_ui_text(prop, "Interpolation falloff", "Controls how much nearby polygons influence deformation"); + RNA_def_property_update(prop, 0, "rna_Modifier_update"); + + prop = RNA_def_property(srna, "is_bound", PROP_BOOLEAN, PROP_NONE); + RNA_def_property_boolean_funcs(prop, "rna_SurfaceDeformModifier_is_bound_get", NULL); + RNA_def_property_ui_text(prop, "Bound", "Whether geometry has been bound to target mesh"); + RNA_def_property_clear_flag(prop, PROP_EDITABLE); +} + void RNA_def_modifier(BlenderRNA *brna) { StructRNA *srna; @@ -4819,6 +4855,7 @@ void RNA_def_modifier(BlenderRNA *brna) rna_def_modifier_datatransfer(brna); rna_def_modifier_normaledit(brna); rna_def_modifier_meshseqcache(brna); + 
rna_def_modifier_surfacedeform(brna); } #endif diff --git a/source/blender/makesrna/intern/rna_nodetree.c b/source/blender/makesrna/intern/rna_nodetree.c index b35142f2a58..784004182dd 100644 --- a/source/blender/makesrna/intern/rna_nodetree.c +++ b/source/blender/makesrna/intern/rna_nodetree.c @@ -5721,8 +5721,8 @@ static void def_cmp_glare(StructRNA *srna) RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update"); prop = RNA_def_property(srna, "streaks", PROP_INT, PROP_NONE); - RNA_def_property_int_sdna(prop, NULL, "angle"); - RNA_def_property_range(prop, 2, 16); + RNA_def_property_int_sdna(prop, NULL, "streaks"); + RNA_def_property_range(prop, 1, 16); RNA_def_property_ui_text(prop, "Streaks", "Total number of streaks"); RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update"); @@ -5739,7 +5739,7 @@ static void def_cmp_glare(StructRNA *srna) RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update"); prop = RNA_def_property(srna, "use_rotate_45", PROP_BOOLEAN, PROP_NONE); - RNA_def_property_boolean_sdna(prop, NULL, "angle", 0); + RNA_def_property_boolean_sdna(prop, NULL, "star_45", 0); RNA_def_property_ui_text(prop, "Rotate 45", "Simple star filter: add 45 degree rotation offset"); RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update"); diff --git a/source/blender/makesrna/intern/rna_object.c b/source/blender/makesrna/intern/rna_object.c index 0cffba47f16..b3c166a6810 100644 --- a/source/blender/makesrna/intern/rna_object.c +++ b/source/blender/makesrna/intern/rna_object.c @@ -1321,8 +1321,12 @@ static void rna_Object_active_constraint_set(PointerRNA *ptr, PointerRNA value) static bConstraint *rna_Object_constraints_new(Object *object, int type) { + bConstraint *new_con = BKE_constraint_add_for_object(object, NULL, type); + + ED_object_constraint_tag_update(object, new_con); WM_main_add_notifier(NC_OBJECT | ND_CONSTRAINT | NA_ADDED, object); - return BKE_constraint_add_for_object(object, NULL, type); + + return 
new_con; } static void rna_Object_constraints_remove(Object *object, ReportList *reports, PointerRNA *con_ptr) diff --git a/source/blender/makesrna/intern/rna_object_force.c b/source/blender/makesrna/intern/rna_object_force.c index 1d89f7535c4..514fca1b011 100644 --- a/source/blender/makesrna/intern/rna_object_force.c +++ b/source/blender/makesrna/intern/rna_object_force.c @@ -1275,7 +1275,7 @@ static void rna_def_field(BlenderRNA *brna) prop = RNA_def_property(srna, "falloff_power", PROP_FLOAT, PROP_NONE); RNA_def_property_float_sdna(prop, NULL, "f_power"); RNA_def_property_range(prop, 0.0f, 10.0f); - RNA_def_property_ui_text(prop, "Falloff Power", "Falloff power (real gravitational falloff = 2)"); + RNA_def_property_ui_text(prop, "Falloff Power", ""); RNA_def_property_update(prop, 0, "rna_FieldSettings_update"); prop = RNA_def_property(srna, "distance_min", PROP_FLOAT, PROP_NONE); @@ -1394,6 +1394,11 @@ static void rna_def_field(BlenderRNA *brna) RNA_def_property_boolean_sdna(prop, NULL, "flag", PFIELD_SMOKE_DENSITY); RNA_def_property_ui_text(prop, "Apply Density", "Adjust force strength based on smoke density"); RNA_def_property_update(prop, 0, "rna_FieldSettings_update"); + prop = RNA_def_property(srna, "use_gravity_falloff", PROP_BOOLEAN, PROP_NONE); + RNA_def_property_boolean_sdna(prop, NULL, "flag", PFIELD_GRAVITATION); + RNA_def_property_ui_text(prop, "Gravity Falloff", "Multiply force by 1/distance²"); + RNA_def_property_update(prop, 0, "rna_FieldSettings_update"); + /* Pointer */ diff --git a/source/blender/makesrna/intern/rna_pose.c b/source/blender/makesrna/intern/rna_pose.c index 28ce63a61bd..8d161466d56 100644 --- a/source/blender/makesrna/intern/rna_pose.c +++ b/source/blender/makesrna/intern/rna_pose.c @@ -524,12 +524,15 @@ static void rna_PoseChannel_active_constraint_set(PointerRNA *ptr, PointerRNA va BKE_constraints_active_set(&pchan->constraints, (bConstraint *)value.data); } -static bConstraint *rna_PoseChannel_constraints_new(bPoseChannel 
*pchan, int type) +static bConstraint *rna_PoseChannel_constraints_new(ID *id, bPoseChannel *pchan, Main *main, int type) { - /*WM_main_add_notifier(NC_OBJECT|ND_CONSTRAINT|NA_ADDED, object); */ - /* TODO, pass object also */ - /* TODO, new pose bones don't have updated draw flags */ - return BKE_constraint_add_for_pose(NULL, pchan, NULL, type); + Object *ob = (Object *)id; + bConstraint *new_con = BKE_constraint_add_for_pose(ob, pchan, NULL, type); + + ED_object_constraint_dependency_tag_update(main, ob, new_con); + WM_main_add_notifier(NC_OBJECT | ND_CONSTRAINT | NA_ADDED, id); + + return new_con; } static void rna_PoseChannel_constraints_remove(ID *id, bPoseChannel *pchan, ReportList *reports, PointerRNA *con_ptr) @@ -764,6 +767,7 @@ static void rna_def_pose_channel_constraints(BlenderRNA *brna, PropertyRNA *cpro /* Constraint collection */ func = RNA_def_function(srna, "new", "rna_PoseChannel_constraints_new"); RNA_def_function_ui_description(func, "Add a constraint to this object"); + RNA_def_function_flag(func, FUNC_USE_MAIN | FUNC_USE_SELF_ID); /* ID and Main needed for refresh */ /* return type */ parm = RNA_def_pointer(func, "constraint", "Constraint", "", "New constraint"); RNA_def_function_return(func, parm); diff --git a/source/blender/makesrna/intern/rna_scene.c b/source/blender/makesrna/intern/rna_scene.c index 1166fb89a0a..121e4f56a6e 100644 --- a/source/blender/makesrna/intern/rna_scene.c +++ b/source/blender/makesrna/intern/rna_scene.c @@ -411,7 +411,7 @@ EnumPropertyItem rna_enum_gpencil_interpolation_mode_items[] = { /* interpolation */ {0, "", 0, N_("Interpolation"), "Standard transitions between keyframes"}, {GP_IPO_LINEAR, "LINEAR", ICON_IPO_LINEAR, "Linear", "Straight-line interpolation between A and B (i.e. 
no ease in/out)"}, - {GP_IPO_CURVEMAP, "CUSTOM", ICON_IPO_BEZIER, "Custom", "Custom interpolation defined using a curvemap"}, + {GP_IPO_CURVEMAP, "CUSTOM", ICON_IPO_BEZIER, "Custom", "Custom interpolation defined using a curve map"}, /* easing */ {0, "", 0, N_("Easing (by strength)"), "Predefined inertial transitions, useful for motion graphics (from least to most ''dramatic'')"}, @@ -792,6 +792,21 @@ static void rna_Scene_frame_current_set(PointerRNA *ptr, int value) data->r.cfra = value; } +static float rna_Scene_frame_float_get(PointerRNA *ptr) +{ + Scene *data = (Scene *)ptr->data; + return (float)data->r.cfra + data->r.subframe; +} + +static void rna_Scene_frame_float_set(PointerRNA *ptr, float value) +{ + Scene *data = (Scene *)ptr->data; + /* if negative frames aren't allowed, then we can't use them */ + FRAMENUMBER_MIN_CLAMP(value); + data->r.cfra = (int)value; + data->r.subframe = value - data->r.cfra; +} + static float rna_Scene_frame_current_final_get(PointerRNA *ptr) { Scene *scene = (Scene *)ptr->data; @@ -872,6 +887,12 @@ static void rna_Scene_preview_range_end_frame_set(PointerRNA *ptr, int value) data->r.pefra = value; } +static void rna_Scene_show_subframe_update(Main *UNUSED(bmain), Scene *UNUSED(current_scene), PointerRNA *ptr) +{ + Scene *scene = (Scene *)ptr->id.data; + scene->r.subframe = 0.0f; +} + static void rna_Scene_frame_update(Main *bmain, Scene *UNUSED(current_scene), PointerRNA *ptr) { Scene *scene = (Scene *)ptr->id.data; @@ -7081,8 +7102,19 @@ void RNA_def_scene(BlenderRNA *brna) prop = RNA_def_property(srna, "frame_subframe", PROP_FLOAT, PROP_TIME); RNA_def_property_float_sdna(prop, NULL, "r.subframe"); RNA_def_property_ui_text(prop, "Current Sub-Frame", ""); - RNA_def_property_clear_flag(prop, PROP_ANIMATABLE | PROP_EDITABLE); - + RNA_def_property_clear_flag(prop, PROP_ANIMATABLE); + RNA_def_property_range(prop, 0.0f, 1.0f); + RNA_def_property_ui_range(prop, 0.0f, 1.0f, 0.01, 2); + RNA_def_property_update(prop, NC_SCENE | 
ND_FRAME, "rna_Scene_frame_update"); + + prop = RNA_def_property(srna, "frame_float", PROP_FLOAT, PROP_TIME); + RNA_def_property_ui_text(prop, "Current Sub-Frame", ""); + RNA_def_property_clear_flag(prop, PROP_ANIMATABLE); + RNA_def_property_range(prop, MINAFRAME, MAXFRAME); + RNA_def_property_ui_range(prop, MINAFRAME, MAXFRAME, 0.1, 2); + RNA_def_property_float_funcs(prop, "rna_Scene_frame_float_get", "rna_Scene_frame_float_set", NULL); + RNA_def_property_update(prop, NC_SCENE | ND_FRAME, "rna_Scene_frame_update"); + prop = RNA_def_property(srna, "frame_start", PROP_INT, PROP_TIME); RNA_def_property_clear_flag(prop, PROP_ANIMATABLE); RNA_def_property_int_sdna(prop, NULL, "r.sfra"); @@ -7147,7 +7179,15 @@ void RNA_def_scene(BlenderRNA *brna) RNA_def_property_int_funcs(prop, NULL, "rna_Scene_preview_range_end_frame_set", NULL); RNA_def_property_ui_text(prop, "Preview Range End Frame", "Alternative end frame for UI playback"); RNA_def_property_update(prop, NC_SCENE | ND_FRAME, NULL); - + + /* Subframe for moblur debug. 
*/ + prop = RNA_def_property(srna, "show_subframe", PROP_BOOLEAN, PROP_NONE); + RNA_def_property_clear_flag(prop, PROP_ANIMATABLE); + RNA_def_property_boolean_sdna(prop, NULL, "r.flag", SCER_SHOW_SUBFRAME); + RNA_def_property_ui_text(prop, "Show Subframe", + "Show current scene subframe and allow set it using interface tools"); + RNA_def_property_update(prop, NC_SCENE | ND_FRAME, "rna_Scene_show_subframe_update"); + /* Timeline / Time Navigation settings */ prop = RNA_def_property(srna, "show_keys_from_selected_only", PROP_BOOLEAN, PROP_NONE); RNA_def_property_boolean_negative_sdna(prop, NULL, "flag", SCE_KEYS_NO_SELONLY); diff --git a/source/blender/makesrna/intern/rna_smoke.c b/source/blender/makesrna/intern/rna_smoke.c index 6db370fc152..c12937bd2bf 100644 --- a/source/blender/makesrna/intern/rna_smoke.c +++ b/source/blender/makesrna/intern/rna_smoke.c @@ -832,14 +832,14 @@ static void rna_def_smoke_domain_settings(BlenderRNA *brna) {FLUID_FIELD_COLOR_R, "COLOR_R", 0, "Red", "Red component of the color field"}, {FLUID_FIELD_COLOR_G, "COLOR_G", 0, "Green", "Green component of the color field"}, {FLUID_FIELD_COLOR_B, "COLOR_B", 0, "Blue", "Blue component of the color field"}, - {FLUID_FIELD_DENSITY, "DENSITY", 0, "Density", "Quantity of soot in the fluid"}, + {FLUID_FIELD_DENSITY, "DENSITY", 0, "Density", "Quantity of soot in the fluid"}, {FLUID_FIELD_FLAME, "FLAME", 0, "Flame", "Flame field"}, {FLUID_FIELD_FUEL, "FUEL", 0, "Fuel", "Fuel field"}, {FLUID_FIELD_HEAT, "HEAT", 0, "Heat", "Temperature of the fluid"}, {FLUID_FIELD_VELOCITY_X, "VELOCITY_X", 0, "X Velocity", "X component of the velocity field"}, {FLUID_FIELD_VELOCITY_Y, "VELOCITY_Y", 0, "Y Velocity", "Y component of the velocity field"}, {FLUID_FIELD_VELOCITY_Z, "VELOCITY_Z", 0, "Z Velocity", "Z component of the velocity field"}, - {0, NULL, 0, NULL, NULL} + {0, NULL, 0, NULL, NULL} }; prop = RNA_def_property(srna, "coba_field", PROP_ENUM, PROP_NONE); diff --git 
a/source/blender/makesrna/intern/rna_userdef.c b/source/blender/makesrna/intern/rna_userdef.c index a75e3c13fce..6927abcb4f8 100644 --- a/source/blender/makesrna/intern/rna_userdef.c +++ b/source/blender/makesrna/intern/rna_userdef.c @@ -4192,6 +4192,10 @@ static void rna_def_userdef_system(BlenderRNA *brna) RNA_def_property_ui_text(prop, "Selection Method", "Use OpenGL occlusion queries or selection render mode to accelerate selection"); + prop = RNA_def_property(srna, "use_select_pick_depth", PROP_BOOLEAN, PROP_NONE); + RNA_def_property_boolean_sdna(prop, NULL, "gpu_select_pick_deph", 1); + RNA_def_property_ui_text(prop, "OpenGL Depth Picking", "Use the depth buffer for picking 3D View selection"); + /* Full scene anti-aliasing */ prop = RNA_def_property(srna, "multi_sample", PROP_ENUM, PROP_NONE); RNA_def_property_enum_bitflag_sdna(prop, NULL, "ogl_multisamples"); diff --git a/source/blender/modifiers/CMakeLists.txt b/source/blender/modifiers/CMakeLists.txt index bacfc177432..ad2b862141c 100644 --- a/source/blender/modifiers/CMakeLists.txt +++ b/source/blender/modifiers/CMakeLists.txt @@ -93,6 +93,7 @@ set(SRC intern/MOD_solidify.c intern/MOD_subsurf.c intern/MOD_surface.c + intern/MOD_surfacedeform.c intern/MOD_triangulate.c intern/MOD_util.c intern/MOD_uvwarp.c diff --git a/source/blender/modifiers/MOD_modifiertypes.h b/source/blender/modifiers/MOD_modifiertypes.h index 4c881445893..bf121af2bd1 100644 --- a/source/blender/modifiers/MOD_modifiertypes.h +++ b/source/blender/modifiers/MOD_modifiertypes.h @@ -85,6 +85,7 @@ extern ModifierTypeInfo modifierType_DataTransfer; extern ModifierTypeInfo modifierType_NormalEdit; extern ModifierTypeInfo modifierType_CorrectiveSmooth; extern ModifierTypeInfo modifierType_MeshSequenceCache; +extern ModifierTypeInfo modifierType_SurfaceDeform; /* MOD_util.c */ void modifier_type_init(ModifierTypeInfo *types[]); diff --git a/source/blender/modifiers/intern/MOD_boolean.c b/source/blender/modifiers/intern/MOD_boolean.c index 
f828bc68857..f86d8b99f3c 100644 --- a/source/blender/modifiers/intern/MOD_boolean.c +++ b/source/blender/modifiers/intern/MOD_boolean.c @@ -319,6 +319,7 @@ static DerivedMesh *applyModifier_bmesh( use_separate, use_dissolve, use_island_connect, + false, bmd->operation, bmd->double_threshold); diff --git a/source/blender/modifiers/intern/MOD_displace.c b/source/blender/modifiers/intern/MOD_displace.c index f6acbef96e9..18f60bab490 100644 --- a/source/blender/modifiers/intern/MOD_displace.c +++ b/source/blender/modifiers/intern/MOD_displace.c @@ -325,7 +325,7 @@ static void displaceModifier_do( float (*tex_co)[3]; float weight = 1.0f; /* init value unused but some compilers may complain */ float (*vert_clnors)[3] = NULL; - float local_mat[4][4] = {0}; + float local_mat[4][4] = {{0}}; const bool use_global_direction = dmd->space == MOD_DISP_SPACE_GLOBAL; if (!dmd->texture && dmd->direction == MOD_DISP_DIR_RGB_XYZ) return; diff --git a/source/blender/modifiers/intern/MOD_dynamicpaint.c b/source/blender/modifiers/intern/MOD_dynamicpaint.c index 05068b9b597..bb75d655802 100644 --- a/source/blender/modifiers/intern/MOD_dynamicpaint.c +++ b/source/blender/modifiers/intern/MOD_dynamicpaint.c @@ -116,7 +116,7 @@ static DerivedMesh *applyModifier(ModifierData *md, Object *ob, static bool is_brush_cb(Object *UNUSED(ob), ModifierData *pmd) { - return ((DynamicPaintModifierData*)pmd)->brush != NULL; + return ((DynamicPaintModifierData *)pmd)->brush != NULL; } static void updateDepgraph(ModifierData *md, DagForest *forest, diff --git a/source/blender/modifiers/intern/MOD_surfacedeform.c b/source/blender/modifiers/intern/MOD_surfacedeform.c new file mode 100644 index 00000000000..a999d7629af --- /dev/null +++ b/source/blender/modifiers/intern/MOD_surfacedeform.c @@ -0,0 +1,1226 @@ +#include "DNA_object_types.h" +#include "DNA_scene_types.h" + +#include "BLI_alloca.h" +#include "BLI_math.h" +#include "BLI_math_geom.h" +#include "BLI_task.h" + +#include "BKE_cdderivedmesh.h" 
+#include "BKE_editmesh.h" +#include "BKE_library_query.h" +#include "BKE_modifier.h" + +#include "depsgraph_private.h" + +#include "MEM_guardedalloc.h" + +#include "MOD_util.h" + +typedef struct SDefAdjacency { + struct SDefAdjacency *next; + unsigned int index; +} SDefAdjacency; + +typedef struct SDefAdjacencyArray { + SDefAdjacency *first; + unsigned int num; /* Careful, this is twice the number of polygons (avoids an extra loop) */ +} SDefAdjacencyArray; + +typedef struct SDefEdgePolys { + unsigned int polys[2], num; +} SDefEdgePolys; + +typedef struct SDefBindCalcData { + BVHTreeFromMesh * const treeData; + const SDefAdjacencyArray * const vert_edges; + const SDefEdgePolys * const edge_polys; + SDefVert * const bind_verts; + const MLoopTri * const looptri; + const MPoly * const mpoly; + const MEdge * const medge; + const MLoop * const mloop; + float (* const targetCos)[3]; + float (* const vertexCos)[3]; + float imat[4][4]; + const float falloff; + int success; +} SDefBindCalcData; + +typedef struct SDefBindPoly { + float (*coords)[3]; + float (*coords_v2)[2]; + float point_v2[2]; + float weight_angular; + float weight_dist_proj; + float weight_dist; + float weight; + float scales[2]; + float centroid[3]; + float centroid_v2[2]; + float normal[3]; + float cent_edgemid_vecs_v2[2][2]; + float edgemid_angle; + float point_edgemid_angles[2]; + float corner_edgemid_angles[2]; + float dominant_angle_weight; + unsigned int index; + unsigned int numverts; + unsigned int loopstart; + unsigned int edge_inds[2]; + unsigned int edge_vert_inds[2]; + unsigned int corner_ind; + unsigned int dominant_edge; + bool inside; +} SDefBindPoly; + +typedef struct SDefBindWeightData { + SDefBindPoly *bind_polys; + unsigned int numpoly; + unsigned int numbinds; +} SDefBindWeightData; + +typedef struct SDefDeformData { + const SDefVert * const bind_verts; + float (* const targetCos)[3]; + float (* const vertexCos)[3]; +} SDefDeformData; + +/* Bind result values */ +enum { + 
MOD_SDEF_BIND_RESULT_SUCCESS = 1, + MOD_SDEF_BIND_RESULT_GENERIC_ERR = 0, + MOD_SDEF_BIND_RESULT_MEM_ERR = -1, + MOD_SDEF_BIND_RESULT_NONMANY_ERR = -2, + MOD_SDEF_BIND_RESULT_CONCAVE_ERR = -3, + MOD_SDEF_BIND_RESULT_OVERLAP_ERR = -4, +}; + +/* Infinite weight flags */ +enum { + MOD_SDEF_INFINITE_WEIGHT_ANGULAR = (1 << 0), + MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ = (1 << 1), + MOD_SDEF_INFINITE_WEIGHT_DIST = (1 << 2), +}; + +static void initData(ModifierData *md) +{ + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + smd->target = NULL; + smd->verts = NULL; + smd->flags = 0; + smd->falloff = 4.0f; +} + +static void freeData(ModifierData *md) +{ + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + + if (smd->verts) { + for (int i = 0; i < smd->numverts; i++) { + if (smd->verts[i].binds) { + for (int j = 0; j < smd->verts[i].numbinds; j++) { + MEM_SAFE_FREE(smd->verts[i].binds[j].vert_inds); + MEM_SAFE_FREE(smd->verts[i].binds[j].vert_weights); + } + + MEM_freeN(smd->verts[i].binds); + } + } + + MEM_freeN(smd->verts); + smd->verts = NULL; + } +} + +static void copyData(ModifierData *md, ModifierData *target) +{ + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + SurfaceDeformModifierData *tsmd = (SurfaceDeformModifierData *)target; + + *tsmd = *smd; + + if (smd->verts) { + tsmd->verts = MEM_dupallocN(smd->verts); + + for (int i = 0; i < smd->numverts; i++) { + if (smd->verts[i].binds) { + tsmd->verts[i].binds = MEM_dupallocN(smd->verts[i].binds); + + for (int j = 0; j < smd->verts[i].numbinds; j++) { + if (smd->verts[i].binds[j].vert_inds) { + tsmd->verts[i].binds[j].vert_inds = MEM_dupallocN(smd->verts[i].binds[j].vert_inds); + } + + if (smd->verts[i].binds[j].vert_weights) { + tsmd->verts[i].binds[j].vert_weights = MEM_dupallocN(smd->verts[i].binds[j].vert_weights); + } + } + } + } + } +} + +static void foreachObjectLink(ModifierData *md, Object *ob, ObjectWalkFunc walk, void *userData) +{ + SurfaceDeformModifierData 
*smd = (SurfaceDeformModifierData *)md; + + walk(userData, ob, &smd->target, IDWALK_NOP); +} + +static void updateDepgraph(ModifierData *md, DagForest *forest, + struct Main *UNUSED(bmain), + struct Scene *UNUSED(scene), + Object *UNUSED(ob), + DagNode *obNode) +{ + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + + if (smd->target) { + DagNode *curNode = dag_get_node(forest, smd->target); + + dag_add_relation(forest, curNode, obNode, DAG_RL_DATA_DATA, "Surface Deform Modifier"); + } +} + +static void updateDepsgraph(ModifierData *md, + struct Main *UNUSED(bmain), + struct Scene *UNUSED(scene), + Object *UNUSED(ob), + struct DepsNodeHandle *node) +{ + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + if (smd->target != NULL) { + DEG_add_object_relation(node, smd->target, DEG_OB_COMP_GEOMETRY, "Surface Deform Modifier"); + } +} + +static void freeAdjacencyMap(SDefAdjacencyArray * const vert_edges, SDefAdjacency * const adj_ref, SDefEdgePolys * const edge_polys) +{ + MEM_freeN(edge_polys); + + MEM_freeN(adj_ref); + + MEM_freeN(vert_edges); +} + +static int buildAdjacencyMap(const MPoly *poly, const MEdge *edge, const MLoop * const mloop, const unsigned int numpoly, const unsigned int numedges, + SDefAdjacencyArray * const vert_edges, SDefAdjacency *adj, SDefEdgePolys * const edge_polys) +{ + const MLoop *loop; + + /* Find polygons adjacent to edges */ + for (int i = 0; i < numpoly; i++, poly++) { + loop = &mloop[poly->loopstart]; + + for (int j = 0; j < poly->totloop; j++, loop++) { + if (edge_polys[loop->e].num == 0) { + edge_polys[loop->e].polys[0] = i; + edge_polys[loop->e].polys[1] = -1; + edge_polys[loop->e].num++; + } + else if (edge_polys[loop->e].num == 1) { + edge_polys[loop->e].polys[1] = i; + edge_polys[loop->e].num++; + } + else { + return MOD_SDEF_BIND_RESULT_NONMANY_ERR; + } + } + } + + /* Find edges adjacent to vertices */ + for (int i = 0; i < numedges; i++, edge++) { + adj->next = vert_edges[edge->v1].first; 
adj->index = i; + vert_edges[edge->v1].first = adj; + vert_edges[edge->v1].num += edge_polys[i].num; + adj++; + + adj->next = vert_edges[edge->v2].first; + adj->index = i; + vert_edges[edge->v2].first = adj; + vert_edges[edge->v2].num += edge_polys[i].num; + adj++; + } + + return MOD_SDEF_BIND_RESULT_SUCCESS; +} + +BLI_INLINE void sortPolyVertsEdge(unsigned int *indices, const MLoop * const mloop, const unsigned int edge, const unsigned int num) +{ + bool found = false; + + for (int i = 0; i < num; i++) { + if (mloop[i].e == edge) { + found = true; + } + if (found) { + *indices = mloop[i].v; + indices++; + } + } + + /* Fill in remaining vertex indices that occur before the edge */ + for (int i = 0; mloop[i].e != edge; i++) { + *indices = mloop[i].v; + indices++; + } +} + +BLI_INLINE void sortPolyVertsTri(unsigned int *indices, const MLoop * const mloop, const unsigned int loopstart, const unsigned int num) +{ + for (int i = loopstart; i < num; i++) { + *indices = mloop[i].v; + indices++; + } + + for (int i = 0; i < loopstart; i++) { + *indices = mloop[i].v; + indices++; + } +} + +BLI_INLINE unsigned int nearestVert(SDefBindCalcData * const data, const float point_co[3]) +{ + BVHTreeNearest nearest = {.dist_sq = FLT_MAX, .index = -1}; + const MPoly *poly; + const MEdge *edge; + const MLoop *loop; + float t_point[3]; + float max_dist = FLT_MAX; + float dist; + unsigned int index = 0; + + mul_v3_m4v3(t_point, data->imat, point_co); + + BLI_bvhtree_find_nearest(data->treeData->tree, t_point, &nearest, data->treeData->nearest_callback, data->treeData); + + poly = &data->mpoly[data->looptri[nearest.index].poly]; + loop = &data->mloop[poly->loopstart]; + + for (int i = 0; i < poly->totloop; i++, loop++) { + edge = &data->medge[loop->e]; + dist = dist_squared_to_line_segment_v3(point_co, data->targetCos[edge->v1], data->targetCos[edge->v2]); + + if (dist < max_dist) { + max_dist = dist; + index = loop->e; + } + } + + edge = &data->medge[index]; + if 
(len_squared_v3v3(point_co, data->targetCos[edge->v1]) < len_squared_v3v3(point_co, data->targetCos[edge->v2])) { + return edge->v1; + } + else { + return edge->v2; + } +} + +BLI_INLINE int isPolyValid(const float coords[][2], const unsigned int nr) +{ + float prev_co[2]; + float curr_vec[2], prev_vec[2]; + + if (!is_poly_convex_v2(coords, nr)) { + return MOD_SDEF_BIND_RESULT_CONCAVE_ERR; + } + + copy_v2_v2(prev_co, coords[nr - 1]); + sub_v2_v2v2(prev_vec, prev_co, coords[nr - 2]); + + for (int i = 0; i < nr; i++) { + sub_v2_v2v2(curr_vec, coords[i], prev_co); + + if (len_squared_v2(curr_vec) < FLT_EPSILON) { + return MOD_SDEF_BIND_RESULT_OVERLAP_ERR; + } + + if (1.0f - dot_v2v2(prev_vec, curr_vec) < FLT_EPSILON) { + return MOD_SDEF_BIND_RESULT_CONCAVE_ERR; + } + + copy_v2_v2(prev_co, coords[i]); + copy_v2_v2(prev_vec, curr_vec); + } + + return MOD_SDEF_BIND_RESULT_SUCCESS; +} + +static void freeBindData(SDefBindWeightData * const bwdata) +{ + SDefBindPoly *bpoly = bwdata->bind_polys; + + if (bwdata->bind_polys) { + for (int i = 0; i < bwdata->numpoly; bpoly++, i++) { + MEM_SAFE_FREE(bpoly->coords); + MEM_SAFE_FREE(bpoly->coords_v2); + } + + MEM_freeN(bwdata->bind_polys); + } + + MEM_freeN(bwdata); +} + +BLI_INLINE float computeAngularWeight(const float point_angle, const float edgemid_angle) +{ + float weight; + + weight = point_angle; + weight /= edgemid_angle; + weight *= M_PI_2; + + return sinf(weight); +} + +BLI_INLINE SDefBindWeightData *computeBindWeights(SDefBindCalcData * const data, const float point_co[3]) +{ + const unsigned int nearest = nearestVert(data, point_co); + const SDefAdjacency * const vert_edges = data->vert_edges[nearest].first; + const SDefEdgePolys * const edge_polys = data->edge_polys; + + const SDefAdjacency *vedge; + const MPoly *poly; + const MLoop *loop; + + SDefBindWeightData *bwdata; + SDefBindPoly *bpoly; + + float world[3] = {0.0f, 0.0f, 1.0f}; + float avg_point_dist = 0.0f; + float tot_weight = 0.0f; + int inf_weight_flags = 0; 
+ + bwdata = MEM_callocN(sizeof(*bwdata), "SDefBindWeightData"); + if (bwdata == NULL) { + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return NULL; + } + + bwdata->numpoly = data->vert_edges[nearest].num / 2; + + bpoly = MEM_callocN(sizeof(*bpoly) * bwdata->numpoly, "SDefBindPoly"); + if (bpoly == NULL) { + freeBindData(bwdata); + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return NULL; + } + + bwdata->bind_polys = bpoly; + + /* Loop over all adjacent edges, and build the SDefBindPoly data for each poly adjacent to those */ + for (vedge = vert_edges; vedge; vedge = vedge->next) { + unsigned int edge_ind = vedge->index; + + for (int i = 0; i < edge_polys[edge_ind].num; i++) { + { + bpoly = bwdata->bind_polys; + + for (int j = 0; j < bwdata->numpoly; bpoly++, j++) { + /* If coords isn't allocated, we have reached the first uninitialized bpoly */ + if ((bpoly->index == edge_polys[edge_ind].polys[i]) || (!bpoly->coords)) { + break; + } + } + } + + /* Check if poly was already created by another edge or still has to be initialized */ + if (!bpoly->coords) { + float angle; + float axis[3]; + float tmp_vec_v2[2]; + int is_poly_valid; + + bpoly->index = edge_polys[edge_ind].polys[i]; + bpoly->coords = NULL; + bpoly->coords_v2 = NULL; + + /* Copy poly data */ + poly = &data->mpoly[bpoly->index]; + loop = &data->mloop[poly->loopstart]; + + bpoly->numverts = poly->totloop; + bpoly->loopstart = poly->loopstart; + + bpoly->coords = MEM_mallocN(sizeof(*bpoly->coords) * poly->totloop, "SDefBindPolyCoords"); + if (bpoly->coords == NULL) { + freeBindData(bwdata); + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return NULL; + } + + bpoly->coords_v2 = MEM_mallocN(sizeof(*bpoly->coords_v2) * poly->totloop, "SDefBindPolyCoords_v2"); + if (bpoly->coords_v2 == NULL) { + freeBindData(bwdata); + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return NULL; + } + + for (int j = 0; j < poly->totloop; j++, loop++) { + copy_v3_v3(bpoly->coords[j], data->targetCos[loop->v]); + + /* 
Find corner and edge indices within poly loop array */ + if (loop->v == nearest) { + bpoly->corner_ind = j; + bpoly->edge_vert_inds[0] = (j == 0) ? (poly->totloop - 1) : (j - 1); + bpoly->edge_vert_inds[1] = (j == poly->totloop - 1) ? (0) : (j + 1); + + bpoly->edge_inds[0] = data->mloop[poly->loopstart + bpoly->edge_vert_inds[0]].e; + bpoly->edge_inds[1] = loop->e; + } + } + + /* Compute poly's parametric data */ + mid_v3_v3_array(bpoly->centroid, bpoly->coords, poly->totloop); + normal_poly_v3(bpoly->normal, bpoly->coords, poly->totloop); + + /* Compute poly skew angle and axis */ + angle = angle_normalized_v3v3(bpoly->normal, world); + + cross_v3_v3v3(axis, bpoly->normal, world); + normalize_v3(axis); + + /* Map coords onto 2d normal plane */ + map_to_plane_axis_angle_v2_v3v3fl(bpoly->point_v2, point_co, axis, angle); + + zero_v2(bpoly->centroid_v2); + for (int j = 0; j < poly->totloop; j++) { + map_to_plane_axis_angle_v2_v3v3fl(bpoly->coords_v2[j], bpoly->coords[j], axis, angle); + madd_v2_v2fl(bpoly->centroid_v2, bpoly->coords_v2[j], 1.0f / poly->totloop); + } + + is_poly_valid = isPolyValid(bpoly->coords_v2, poly->totloop); + + if (is_poly_valid != MOD_SDEF_BIND_RESULT_SUCCESS) { + freeBindData(bwdata); + data->success = is_poly_valid; + return NULL; + } + + bpoly->inside = isect_point_poly_v2(bpoly->point_v2, bpoly->coords_v2, poly->totloop, false); + + /* Initialize weight components */ + bpoly->weight_angular = 1.0f; + bpoly->weight_dist_proj = len_v2v2(bpoly->centroid_v2, bpoly->point_v2); + bpoly->weight_dist = len_v3v3(bpoly->centroid, point_co); + + avg_point_dist += bpoly->weight_dist; + + /* Compute centroid to mid-edge vectors */ + mid_v2_v2v2(bpoly->cent_edgemid_vecs_v2[0], + bpoly->coords_v2[bpoly->edge_vert_inds[0]], + bpoly->coords_v2[bpoly->corner_ind]); + + mid_v2_v2v2(bpoly->cent_edgemid_vecs_v2[1], + bpoly->coords_v2[bpoly->edge_vert_inds[1]], + bpoly->coords_v2[bpoly->corner_ind]); + + sub_v2_v2(bpoly->cent_edgemid_vecs_v2[0], 
bpoly->centroid_v2); + sub_v2_v2(bpoly->cent_edgemid_vecs_v2[1], bpoly->centroid_v2); + + /* Compute poly scales with respect to mid-edges, and normalize the vectors */ + bpoly->scales[0] = normalize_v2(bpoly->cent_edgemid_vecs_v2[0]); + bpoly->scales[1] = normalize_v2(bpoly->cent_edgemid_vecs_v2[1]); + + /* Compute the required polygon angles */ + bpoly->edgemid_angle = angle_normalized_v2v2(bpoly->cent_edgemid_vecs_v2[0], bpoly->cent_edgemid_vecs_v2[1]); + + sub_v2_v2v2(tmp_vec_v2, bpoly->coords_v2[bpoly->corner_ind], bpoly->centroid_v2); + normalize_v2(tmp_vec_v2); + + bpoly->corner_edgemid_angles[0] = angle_normalized_v2v2(tmp_vec_v2, bpoly->cent_edgemid_vecs_v2[0]); + bpoly->corner_edgemid_angles[1] = angle_normalized_v2v2(tmp_vec_v2, bpoly->cent_edgemid_vecs_v2[1]); + + /* Check for infinite weights, and compute angular data otherwise */ + if (bpoly->weight_dist < FLT_EPSILON) { + inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ; + inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST; + } + else if (bpoly->weight_dist_proj < FLT_EPSILON) { + inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ; + } + else { + float cent_point_vec[2]; + + sub_v2_v2v2(cent_point_vec, bpoly->point_v2, bpoly->centroid_v2); + normalize_v2(cent_point_vec); + + bpoly->point_edgemid_angles[0] = angle_normalized_v2v2(cent_point_vec, bpoly->cent_edgemid_vecs_v2[0]); + bpoly->point_edgemid_angles[1] = angle_normalized_v2v2(cent_point_vec, bpoly->cent_edgemid_vecs_v2[1]); + } + } + } + + avg_point_dist /= bwdata->numpoly; + + /* If weights 1 and 2 are not infinite, loop over all adjacent edges again, + * and build adjacency dependent angle data (depends on all polygons having been computed) */ + if (!inf_weight_flags) { + for (vedge = vert_edges; vedge; vedge = vedge->next) { + SDefBindPoly *bpolys[2]; + const SDefEdgePolys *epolys; + float ang_weights[2]; + unsigned int edge_ind = vedge->index; + unsigned int edge_on_poly[2]; + + epolys = &edge_polys[edge_ind]; + + /* Find bind 
polys corresponding to the edge's adjacent polys */ + bpoly = bwdata->bind_polys; + + for (int i = 0, j = 0; (i < bwdata->numpoly) && (j < epolys->num); bpoly++, i++) { + if (ELEM(bpoly->index, epolys->polys[0], epolys->polys[1])) { + bpolys[j] = bpoly; + + if (bpoly->edge_inds[0] == edge_ind) { + edge_on_poly[j] = 0; + } + else { + edge_on_poly[j] = 1; + } + + j++; + } + } + + /* Compute angular weight component */ + if (epolys->num == 1) { + ang_weights[0] = computeAngularWeight(bpolys[0]->point_edgemid_angles[edge_on_poly[0]], bpolys[0]->edgemid_angle); + bpolys[0]->weight_angular *= ang_weights[0] * ang_weights[0]; + } + else if (epolys->num == 2) { + ang_weights[0] = computeAngularWeight(bpolys[0]->point_edgemid_angles[edge_on_poly[0]], bpolys[0]->edgemid_angle); + ang_weights[1] = computeAngularWeight(bpolys[1]->point_edgemid_angles[edge_on_poly[1]], bpolys[1]->edgemid_angle); + + bpolys[0]->weight_angular *= ang_weights[0] * ang_weights[1]; + bpolys[1]->weight_angular *= ang_weights[0] * ang_weights[1]; + } + } + } + + /* Compute scalings and falloff. + * Scale all weights if no infinite weight is found, + * scale only unprojected weight if projected weight is infinite, + * scale none if both are infinite. 
*/ + if (!inf_weight_flags) { + bpoly = bwdata->bind_polys; + + for (int i = 0; i < bwdata->numpoly; bpoly++, i++) { + float corner_angle_weights[2]; + float scale_weight, sqr, inv_sqr; + + corner_angle_weights[0] = bpoly->point_edgemid_angles[0] / bpoly->corner_edgemid_angles[0]; + corner_angle_weights[1] = bpoly->point_edgemid_angles[1] / bpoly->corner_edgemid_angles[1]; + + if (isnan(corner_angle_weights[0]) || isnan(corner_angle_weights[1])) { + freeBindData(bwdata); + data->success = MOD_SDEF_BIND_RESULT_GENERIC_ERR; + return NULL; + } + + /* Find which edge the point is closer to */ + if (corner_angle_weights[0] < corner_angle_weights[1]) { + bpoly->dominant_edge = 0; + bpoly->dominant_angle_weight = corner_angle_weights[0]; + } + else { + bpoly->dominant_edge = 1; + bpoly->dominant_angle_weight = corner_angle_weights[1]; + } + + bpoly->dominant_angle_weight = sinf(bpoly->dominant_angle_weight * M_PI_2); + + /* Compute quadratic angular scale interpolation weight */ + scale_weight = bpoly->point_edgemid_angles[bpoly->dominant_edge] / bpoly->edgemid_angle; + scale_weight /= scale_weight + (bpoly->point_edgemid_angles[!bpoly->dominant_edge] / bpoly->edgemid_angle); + + sqr = scale_weight * scale_weight; + inv_sqr = 1.0f - scale_weight; + inv_sqr *= inv_sqr; + scale_weight = sqr / (sqr + inv_sqr); + + /* Compute interpolated scale (no longer need the individual scales, + * so simply storing the result over the scale in index zero) */ + bpoly->scales[0] = bpoly->scales[bpoly->dominant_edge] * (1.0f - scale_weight) + + bpoly->scales[!bpoly->dominant_edge] * scale_weight; + + /* Scale the point distance weights, and introduce falloff */ + bpoly->weight_dist_proj /= bpoly->scales[0]; + bpoly->weight_dist_proj = powf(bpoly->weight_dist_proj, data->falloff); + + bpoly->weight_dist /= avg_point_dist; + bpoly->weight_dist = powf(bpoly->weight_dist, data->falloff); + + /* Re-check for infinite weights, now that all scalings and interpolations are computed */ + if 
(bpoly->weight_dist < FLT_EPSILON) { + inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ; + inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST; + } + else if (bpoly->weight_dist_proj < FLT_EPSILON) { + inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ; + } + else if (bpoly->weight_angular < FLT_EPSILON) { + inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_ANGULAR; + } + } + } + else if (!(inf_weight_flags & MOD_SDEF_INFINITE_WEIGHT_DIST)) { + bpoly = bwdata->bind_polys; + + for (int i = 0; i < bwdata->numpoly; bpoly++, i++) { + /* Scale the point distance weight by average point distance, and introduce falloff */ + bpoly->weight_dist /= avg_point_dist; + bpoly->weight_dist = powf(bpoly->weight_dist, data->falloff); + + /* Re-check for infinite weights, now that all scalings and interpolations are computed */ + if (bpoly->weight_dist < FLT_EPSILON) { + inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST; + } + } + } + + /* Final loop, to compute actual weights */ + bpoly = bwdata->bind_polys; + + for (int i = 0; i < bwdata->numpoly; bpoly++, i++) { + /* Weight computation from components */ + if (inf_weight_flags & MOD_SDEF_INFINITE_WEIGHT_DIST) { + bpoly->weight = bpoly->weight_dist < FLT_EPSILON ? 1.0f : 0.0f; + } + else if (inf_weight_flags & MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ) { + bpoly->weight = bpoly->weight_dist_proj < FLT_EPSILON ? + 1.0f / bpoly->weight_dist : 0.0f; + } + else if (inf_weight_flags & MOD_SDEF_INFINITE_WEIGHT_ANGULAR) { + bpoly->weight = bpoly->weight_angular < FLT_EPSILON ? 
+ 1.0f / bpoly->weight_dist_proj / bpoly->weight_dist : 0.0f; + } + else { + bpoly->weight = 1.0f / bpoly->weight_angular / + bpoly->weight_dist_proj / + bpoly->weight_dist; + } + + tot_weight += bpoly->weight; + } + + bpoly = bwdata->bind_polys; + + for (int i = 0; i < bwdata->numpoly; bpoly++, i++) { + bpoly->weight /= tot_weight; + + /* Evaluate if this poly is relevant to bind */ + /* Even though the weights should add up to 1.0, + * the losses of weights smaller than epsilon here + * should be negligible... */ + if (bpoly->weight >= FLT_EPSILON) { + if (bpoly->inside) { + bwdata->numbinds += 1; + } + else { + if (bpoly->dominant_angle_weight < FLT_EPSILON || 1.0f - bpoly->dominant_angle_weight < FLT_EPSILON) { + bwdata->numbinds += 1; + } + else { + bwdata->numbinds += 2; + } + } + } + } + + return bwdata; +} + +BLI_INLINE float computeNormalDisplacement(const float point_co[3], const float point_co_proj[3], const float normal[3]) +{ + float disp_vec[3]; + float normal_dist; + + sub_v3_v3v3(disp_vec, point_co, point_co_proj); + normal_dist = len_v3(disp_vec); + + if (dot_v3v3(disp_vec, normal) < 0) { + normal_dist *= -1; + } + + return normal_dist; +} + +static void bindVert(void *userdata, void *UNUSED(userdata_chunk), const int index, const int UNUSED(threadid)) +{ + SDefBindCalcData * const data = (SDefBindCalcData *)userdata; + float point_co[3]; + float point_co_proj[3]; + + SDefBindWeightData *bwdata; + SDefVert *sdvert = data->bind_verts + index; + SDefBindPoly *bpoly; + SDefBind *sdbind; + + if (data->success != MOD_SDEF_BIND_RESULT_SUCCESS) { + sdvert->binds = NULL; + sdvert->numbinds = 0; + return; + } + + copy_v3_v3(point_co, data->vertexCos[index]); + bwdata = computeBindWeights(data, point_co); + + if (bwdata == NULL) { + sdvert->binds = NULL; + sdvert->numbinds = 0; + return; + } + + sdvert->binds = MEM_callocN(sizeof(*sdvert->binds) * bwdata->numbinds, "SDefVertBindData"); + if (sdvert->binds == NULL) { + data->success = 
MOD_SDEF_BIND_RESULT_MEM_ERR; + sdvert->numbinds = 0; + return; + } + + sdvert->numbinds = bwdata->numbinds; + + sdbind = sdvert->binds; + + bpoly = bwdata->bind_polys; + + for (int i = 0; i < bwdata->numbinds; bpoly++) { + if (bpoly->weight >= FLT_EPSILON) { + if (bpoly->inside) { + const MLoop *loop = &data->mloop[bpoly->loopstart]; + + sdbind->influence = bpoly->weight; + sdbind->numverts = bpoly->numverts; + + sdbind->mode = MOD_SDEF_MODE_NGON; + sdbind->vert_weights = MEM_mallocN(sizeof(*sdbind->vert_weights) * bpoly->numverts, "SDefNgonVertWeights"); + if (sdbind->vert_weights == NULL) { + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return; + } + + sdbind->vert_inds = MEM_mallocN(sizeof(*sdbind->vert_inds) * bpoly->numverts, "SDefNgonVertInds"); + if (sdbind->vert_inds == NULL) { + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return; + } + + interp_weights_poly_v2(sdbind->vert_weights, bpoly->coords_v2, bpoly->numverts, bpoly->point_v2); + + /* Reproject vert based on weights and original poly verts, to reintroduce poly non-planarity */ + zero_v3(point_co_proj); + for (int j = 0; j < bpoly->numverts; j++, loop++) { + madd_v3_v3fl(point_co_proj, bpoly->coords[j], sdbind->vert_weights[j]); + sdbind->vert_inds[j] = loop->v; + } + + sdbind->normal_dist = computeNormalDisplacement(point_co, point_co_proj, bpoly->normal); + + sdbind++; + i++; + } + else { + float tmp_vec[3]; + float cent[3], norm[3]; + float v1[3], v2[3], v3[3]; + + if (1.0f - bpoly->dominant_angle_weight >= FLT_EPSILON) { + sdbind->influence = bpoly->weight * (1.0f - bpoly->dominant_angle_weight); + sdbind->numverts = bpoly->numverts; + + sdbind->mode = MOD_SDEF_MODE_CENTROID; + sdbind->vert_weights = MEM_mallocN(sizeof(*sdbind->vert_weights) * 3, "SDefCentVertWeights"); + if (sdbind->vert_weights == NULL) { + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return; + } + + sdbind->vert_inds = MEM_mallocN(sizeof(*sdbind->vert_inds) * bpoly->numverts, "SDefCentVertInds"); + if 
(sdbind->vert_inds == NULL) { + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return; + } + + sortPolyVertsEdge(sdbind->vert_inds, &data->mloop[bpoly->loopstart], + bpoly->edge_inds[bpoly->dominant_edge], bpoly->numverts); + + copy_v3_v3(v1, data->targetCos[sdbind->vert_inds[0]]); + copy_v3_v3(v2, data->targetCos[sdbind->vert_inds[1]]); + copy_v3_v3(v3, bpoly->centroid); + + mid_v3_v3v3v3(cent, v1, v2, v3); + normal_tri_v3(norm, v1, v2, v3); + + add_v3_v3v3(tmp_vec, point_co, bpoly->normal); + + /* We are sure the line is not parallel to the plane. + * Checking return value just to avoid warning... */ + if (!isect_line_plane_v3(point_co_proj, point_co, tmp_vec, cent, norm)) { + BLI_assert(false); + } + + interp_weights_tri_v3(sdbind->vert_weights, v1, v2, v3, point_co_proj); + + sdbind->normal_dist = computeNormalDisplacement(point_co, point_co_proj, bpoly->normal); + + sdbind++; + i++; + } + + if (bpoly->dominant_angle_weight >= FLT_EPSILON) { + sdbind->influence = bpoly->weight * bpoly->dominant_angle_weight; + sdbind->numverts = bpoly->numverts; + + sdbind->mode = MOD_SDEF_MODE_LOOPTRI; + sdbind->vert_weights = MEM_mallocN(sizeof(*sdbind->vert_weights) * 3, "SDefTriVertWeights"); + if (sdbind->vert_weights == NULL) { + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return; + } + + sdbind->vert_inds = MEM_mallocN(sizeof(*sdbind->vert_inds) * bpoly->numverts, "SDefTriVertInds"); + if (sdbind->vert_inds == NULL) { + data->success = MOD_SDEF_BIND_RESULT_MEM_ERR; + return; + } + + sortPolyVertsTri(sdbind->vert_inds, &data->mloop[bpoly->loopstart], bpoly->edge_vert_inds[0], bpoly->numverts); + + copy_v3_v3(v1, data->targetCos[sdbind->vert_inds[0]]); + copy_v3_v3(v2, data->targetCos[sdbind->vert_inds[1]]); + copy_v3_v3(v3, data->targetCos[sdbind->vert_inds[2]]); + + mid_v3_v3v3v3(cent, v1, v2, v3); + normal_tri_v3(norm, v1, v2, v3); + + add_v3_v3v3(tmp_vec, point_co, bpoly->normal); + + /* We are sure the line is not parallel to the plane. 
+ * Checking return value just to avoid warning... */ + if (!isect_line_plane_v3(point_co_proj, point_co, tmp_vec, cent, norm)) { + BLI_assert(false); + } + + interp_weights_tri_v3(sdbind->vert_weights, v1, v2, v3, point_co_proj); + + sdbind->normal_dist = computeNormalDisplacement(point_co, point_co_proj, bpoly->normal); + + sdbind++; + i++; + } + } + } + } + + freeBindData(bwdata); +} + +static bool surfacedeformBind(SurfaceDeformModifierData *smd, float (*vertexCos)[3], + unsigned int numverts, unsigned int tnumpoly, unsigned int tnumverts, DerivedMesh *tdm) +{ + BVHTreeFromMesh treeData = {NULL}; + const MVert *mvert = tdm->getVertArray(tdm); + const MPoly *mpoly = tdm->getPolyArray(tdm); + const MEdge *medge = tdm->getEdgeArray(tdm); + const MLoop *mloop = tdm->getLoopArray(tdm); + unsigned int tnumedges = tdm->getNumEdges(tdm); + int adj_result; + SDefAdjacencyArray *vert_edges; + SDefAdjacency *adj_array; + SDefEdgePolys *edge_polys; + + vert_edges = MEM_callocN(sizeof(*vert_edges) * tnumverts, "SDefVertEdgeMap"); + if (vert_edges == NULL) { + modifier_setError((ModifierData *)smd, "Out of memory"); + return false; + } + + adj_array = MEM_mallocN(sizeof(*adj_array) * tnumedges * 2, "SDefVertEdge"); + if (adj_array == NULL) { + modifier_setError((ModifierData *)smd, "Out of memory"); + MEM_freeN(vert_edges); + return false; + } + + edge_polys = MEM_callocN(sizeof(*edge_polys) * tnumedges, "SDefEdgeFaceMap"); + if (edge_polys == NULL) { + modifier_setError((ModifierData *)smd, "Out of memory"); + MEM_freeN(vert_edges); + MEM_freeN(adj_array); + return false; + } + + smd->verts = MEM_mallocN(sizeof(*smd->verts) * numverts, "SDefBindVerts"); + if (smd->verts == NULL) { + modifier_setError((ModifierData *)smd, "Out of memory"); + freeAdjacencyMap(vert_edges, adj_array, edge_polys); + return false; + } + + bvhtree_from_mesh_looptri(&treeData, tdm, 0.0, 2, 6); + if (treeData.tree == NULL) { + modifier_setError((ModifierData *)smd, "Out of memory"); + 
freeAdjacencyMap(vert_edges, adj_array, edge_polys); + MEM_freeN(smd->verts); + smd->verts = NULL; + return false; + } + + adj_result = buildAdjacencyMap(mpoly, medge, mloop, tnumpoly, tnumedges, vert_edges, adj_array, edge_polys); + + if (adj_result == MOD_SDEF_BIND_RESULT_NONMANY_ERR) { + modifier_setError((ModifierData *)smd, "Target has edges with more than two polys"); + freeAdjacencyMap(vert_edges, adj_array, edge_polys); + free_bvhtree_from_mesh(&treeData); + MEM_freeN(smd->verts); + smd->verts = NULL; + return false; + } + + smd->numverts = numverts; + smd->numpoly = tnumpoly; + + SDefBindCalcData data = {.treeData = &treeData, + .vert_edges = vert_edges, + .edge_polys = edge_polys, + .mpoly = mpoly, + .medge = medge, + .mloop = mloop, + .looptri = tdm->getLoopTriArray(tdm), + .targetCos = MEM_mallocN(sizeof(float[3]) * tnumverts, "SDefTargetBindVertArray"), + .bind_verts = smd->verts, + .vertexCos = vertexCos, + .falloff = smd->falloff, + .success = MOD_SDEF_BIND_RESULT_SUCCESS}; + + if (data.targetCos == NULL) { + modifier_setError((ModifierData *)smd, "Out of memory"); + freeData((ModifierData *)smd); + return false; + } + + invert_m4_m4(data.imat, smd->mat); + + for (int i = 0; i < tnumverts; i++) { + mul_v3_m4v3(data.targetCos[i], smd->mat, mvert[i].co); + } + + BLI_task_parallel_range_ex(0, numverts, &data, NULL, 0, bindVert, + numverts > 10000, false); + + MEM_freeN(data.targetCos); + + if (data.success == MOD_SDEF_BIND_RESULT_MEM_ERR) { + modifier_setError((ModifierData *)smd, "Out of memory"); + freeData((ModifierData *)smd); + } + else if (data.success == MOD_SDEF_BIND_RESULT_NONMANY_ERR) { + modifier_setError((ModifierData *)smd, "Target has edges with more than two polys"); + freeData((ModifierData *)smd); + } + else if (data.success == MOD_SDEF_BIND_RESULT_CONCAVE_ERR) { + modifier_setError((ModifierData *)smd, "Target contains concave polys"); + freeData((ModifierData *)smd); + } + else if (data.success == MOD_SDEF_BIND_RESULT_OVERLAP_ERR) { + 
modifier_setError((ModifierData *)smd, "Target contains overlapping verts"); + freeData((ModifierData *)smd); + } + else if (data.success == MOD_SDEF_BIND_RESULT_GENERIC_ERR) { + /* I know this message is vague, but I could not think of a way + * to explain this with a reasonably sized message. + * Though it shouldn't really matter all that much, + * because this is very unlikely to occur */ + modifier_setError((ModifierData *)smd, "Target contains invalid polys"); + freeData((ModifierData *)smd); + } + + freeAdjacencyMap(vert_edges, adj_array, edge_polys); + free_bvhtree_from_mesh(&treeData); + + return data.success == 1; +} + +static void deformVert(void *userdata, void *UNUSED(userdata_chunk), const int index, const int UNUSED(threadid)) +{ + const SDefDeformData * const data = (SDefDeformData *)userdata; + const SDefBind *sdbind = data->bind_verts[index].binds; + float * const vertexCos = data->vertexCos[index]; + float norm[3], temp[3]; + + zero_v3(vertexCos); + + for (int j = 0; j < data->bind_verts[index].numbinds; j++, sdbind++) { + /* Mode-generic operations (allocate poly coordinates) */ + float (*coords)[3] = MEM_mallocN(sizeof(*coords) * sdbind->numverts, "SDefDoPolyCoords"); + + for (int k = 0; k < sdbind->numverts; k++) { + copy_v3_v3(coords[k], data->targetCos[sdbind->vert_inds[k]]); + } + + normal_poly_v3(norm, coords, sdbind->numverts); + zero_v3(temp); + + /* ---------- looptri mode ---------- */ + if (sdbind->mode == MOD_SDEF_MODE_LOOPTRI) { + madd_v3_v3fl(temp, data->targetCos[sdbind->vert_inds[0]], sdbind->vert_weights[0]); + madd_v3_v3fl(temp, data->targetCos[sdbind->vert_inds[1]], sdbind->vert_weights[1]); + madd_v3_v3fl(temp, data->targetCos[sdbind->vert_inds[2]], sdbind->vert_weights[2]); + } + else { + /* ---------- ngon mode ---------- */ + if (sdbind->mode == MOD_SDEF_MODE_NGON) { + for (int k = 0; k < sdbind->numverts; k++) { + madd_v3_v3fl(temp, coords[k], sdbind->vert_weights[k]); + } + } + + /* ---------- centroid mode ---------- */ 
+ else if (sdbind->mode == MOD_SDEF_MODE_CENTROID) { + float cent[3]; + mid_v3_v3_array(cent, coords, sdbind->numverts); + + madd_v3_v3fl(temp, data->targetCos[sdbind->vert_inds[0]], sdbind->vert_weights[0]); + madd_v3_v3fl(temp, data->targetCos[sdbind->vert_inds[1]], sdbind->vert_weights[1]); + madd_v3_v3fl(temp, cent, sdbind->vert_weights[2]); + } + } + + MEM_freeN(coords); + + /* Apply normal offset (generic for all modes) */ + madd_v3_v3fl(temp, norm, sdbind->normal_dist); + + madd_v3_v3fl(vertexCos, temp, sdbind->influence); + } +} + +static void surfacedeformModifier_do(ModifierData *md, float (*vertexCos)[3], unsigned int numverts, Object *ob) +{ + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + DerivedMesh *tdm; + unsigned int tnumverts, tnumpoly; + + /* Exit function if bind flag is not set (free bind data if any) */ + if (!(smd->flags & MOD_SDEF_BIND)) { + freeData(md); + return; + } + + /* Handle target mesh both in and out of edit mode */ + if (smd->target == md->scene->obedit) { + BMEditMesh *em = BKE_editmesh_from_object(smd->target); + tdm = em->derivedFinal; + } + else { + tdm = smd->target->derivedFinal; + } + + tnumverts = tdm->getNumVerts(tdm); + tnumpoly = tdm->getNumPolys(tdm); + + /* If not bound, execute bind */ + if (!(smd->verts)) { + float tmp_mat[4][4]; + + invert_m4_m4(tmp_mat, ob->obmat); + mul_m4_m4m4(smd->mat, tmp_mat, smd->target->obmat); + + if (!surfacedeformBind(smd, vertexCos, numverts, tnumpoly, tnumverts, tdm)) { + smd->flags &= ~MOD_SDEF_BIND; + return; + } + } + + /* Poly count checks */ + if (smd->numverts != numverts) { + modifier_setError(md, "Verts changed from %u to %u", smd->numverts, numverts); + tdm->release(tdm); + return; + } + else if (smd->numpoly != tnumpoly) { + modifier_setError(md, "Target polygons changed from %u to %u", smd->numpoly, tnumpoly); + tdm->release(tdm); + return; + } + + /* Actual vertex location update starts here */ + SDefDeformData data = {.bind_verts = smd->verts, + 
.targetCos = MEM_mallocN(sizeof(float[3]) * tnumverts, "SDefTargetVertArray"), + .vertexCos = vertexCos}; + + if (data.targetCos != NULL) { + bool tdm_vert_alloc; + const MVert * const mvert = DM_get_vert_array(tdm, &tdm_vert_alloc); + + for (int i = 0; i < tnumverts; i++) { + mul_v3_m4v3(data.targetCos[i], smd->mat, mvert[i].co); + } + + BLI_task_parallel_range_ex(0, numverts, &data, NULL, 0, deformVert, + numverts > 10000, false); + + if (tdm_vert_alloc) { + MEM_freeN((void *)mvert); + } + + MEM_freeN(data.targetCos); + } + + tdm->release(tdm); +} + +static void deformVerts(ModifierData *md, Object *ob, + DerivedMesh *UNUSED(derivedData), + float (*vertexCos)[3], int numVerts, + ModifierApplyFlag UNUSED(flag)) +{ + surfacedeformModifier_do(md, vertexCos, numVerts, ob); +} + +static void deformVertsEM(ModifierData *md, Object *ob, + struct BMEditMesh *UNUSED(editData), + DerivedMesh *UNUSED(derivedData), + float (*vertexCos)[3], int numVerts) +{ + surfacedeformModifier_do(md, vertexCos, numVerts, ob); +} + +static bool isDisabled(ModifierData *md, int UNUSED(useRenderParams)) +{ + SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md; + + return !smd->target; +} + +ModifierTypeInfo modifierType_SurfaceDeform = { + /* name */ "Surface Deform", + /* structName */ "SurfaceDeformModifierData", + /* structSize */ sizeof(SurfaceDeformModifierData), + /* type */ eModifierTypeType_OnlyDeform, + /* flags */ eModifierTypeFlag_AcceptsMesh | + eModifierTypeFlag_SupportsEditmode, + + /* copyData */ copyData, + /* deformVerts */ deformVerts, + /* deformMatrices */ NULL, + /* deformVertsEM */ deformVertsEM, + /* deformMatricesEM */ NULL, + /* applyModifier */ NULL, + /* applyModifierEM */ NULL, + /* initData */ initData, + /* requiredDataMask */ NULL, + /* freeData */ freeData, + /* isDisabled */ isDisabled, + /* updateDepgraph */ updateDepgraph, + /* updateDepsgraph */ updateDepsgraph, + /* dependsOnTime */ NULL, + /* dependsOnNormals */ NULL, + /* foreachObjectLink 
*/ foreachObjectLink, + /* foreachIDLink */ NULL, + /* foreachTexLink */ NULL, +}; diff --git a/source/blender/modifiers/intern/MOD_util.c b/source/blender/modifiers/intern/MOD_util.c index 93414562ccf..ded1f0b77e6 100644 --- a/source/blender/modifiers/intern/MOD_util.c +++ b/source/blender/modifiers/intern/MOD_util.c @@ -287,5 +287,6 @@ void modifier_type_init(ModifierTypeInfo *types[]) INIT_TYPE(NormalEdit); INIT_TYPE(CorrectiveSmooth); INIT_TYPE(MeshSequenceCache); + INIT_TYPE(SurfaceDeform); #undef INIT_TYPE } diff --git a/source/blender/nodes/composite/nodes/node_composite_glare.c b/source/blender/nodes/composite/nodes/node_composite_glare.c index c512ea49586..76020e55463 100644 --- a/source/blender/nodes/composite/nodes/node_composite_glare.c +++ b/source/blender/nodes/composite/nodes/node_composite_glare.c @@ -50,7 +50,8 @@ static void node_composit_init_glare(bNodeTree *UNUSED(ntree), bNode *node) ndg->colmod = 0.25; ndg->mix = 0; ndg->threshold = 1; - ndg->angle = 4; + ndg->star_45 = true; + ndg->streaks = 4; ndg->angle_ofs = 0.0f; ndg->fade = 0.9; ndg->size = 8; diff --git a/source/blender/nodes/shader/nodes/node_shader_fresnel.c b/source/blender/nodes/shader/nodes/node_shader_fresnel.c index d5e11795fc0..5a9e33a4053 100644 --- a/source/blender/nodes/shader/nodes/node_shader_fresnel.c +++ b/source/blender/nodes/shader/nodes/node_shader_fresnel.c @@ -64,10 +64,11 @@ static void node_shader_exec_fresnel(void *data, int UNUSED(thread), bNode *UNUS copy_v3_v3(n, shi->vn); } - if(shi->use_world_space_shading) + if (shi->use_world_space_shading) { mul_mat3_m4_v3((float (*)[4])RE_render_current_get_matrix(RE_VIEW_MATRIX), n); + } - out[0]->vec[0] = RE_fresnel_dielectric(shi->view, n, shi->flippednor ? 1/eta : eta); + out[0]->vec[0] = RE_fresnel_dielectric(shi->view, n, shi->flippednor ? 
1 / eta : eta); } /* node type definition */ diff --git a/source/blender/nodes/shader/nodes/node_shader_layer_weight.c b/source/blender/nodes/shader/nodes/node_shader_layer_weight.c index 90e2625b961..a0b2408a7bb 100644 --- a/source/blender/nodes/shader/nodes/node_shader_layer_weight.c +++ b/source/blender/nodes/shader/nodes/node_shader_layer_weight.c @@ -69,7 +69,7 @@ static void node_shader_exec_layer_weight(void *data, int UNUSED(thread), bNode if (shi->use_world_space_shading) mul_mat3_m4_v3((float (*)[4])RE_render_current_get_matrix(RE_VIEW_MATRIX), n); - out[0]->vec[0] = RE_fresnel_dielectric(shi->view, n, shi->flippednor ? eta : 1/eta); + out[0]->vec[0] = RE_fresnel_dielectric(shi->view, n, shi->flippednor ? eta : 1 / eta); float facing = fabs(dot_v3v3(shi->view, n)); if (blend != 0.5) { diff --git a/source/blender/nodes/shader/nodes/node_shader_tex_brick.c b/source/blender/nodes/shader/nodes/node_shader_tex_brick.c index 0be47c4f751..1dfebc45d60 100644 --- a/source/blender/nodes/shader/nodes/node_shader_tex_brick.c +++ b/source/blender/nodes/shader/nodes/node_shader_tex_brick.c @@ -64,7 +64,7 @@ static void node_shader_init_tex_brick(bNodeTree *UNUSED(ntree), bNode *node) for (bNodeSocket *sock = node->inputs.first; sock; sock = sock->next) { if (STREQ(sock->name, "Mortar Smooth")) { - ((bNodeSocketValueFloat*)sock->default_value)->value = 0.1f; + ((bNodeSocketValueFloat *)sock->default_value)->value = 0.1f; } } } diff --git a/source/blender/python/intern/gpu_offscreen.c b/source/blender/python/intern/gpu_offscreen.c index c4863b2a92f..7711ce18bd0 100644 --- a/source/blender/python/intern/gpu_offscreen.c +++ b/source/blender/python/intern/gpu_offscreen.c @@ -202,7 +202,7 @@ static PyObject *pygpu_offscreen_draw_view3d(BPy_GPUOffScreen *self, PyObject *a ARegion *ar; GPUFX *fx; GPUFXSettings fx_settings; - void *rv3d_mats; + struct RV3DMatrixStore *rv3d_mats; BPY_GPU_OFFSCREEN_CHECK_OBJ(self); diff --git a/source/blender/render/CMakeLists.txt 
b/source/blender/render/CMakeLists.txt index 9e40ab02ee4..569b207c966 100644 --- a/source/blender/render/CMakeLists.txt +++ b/source/blender/render/CMakeLists.txt @@ -35,6 +35,7 @@ set(INC ../makesdna ../makesrna ../physics + ../../../intern/atomic ../../../intern/guardedalloc ../../../intern/mikktspace ../../../intern/smoke/extern diff --git a/source/blender/render/intern/source/pointdensity.c b/source/blender/render/intern/source/pointdensity.c index a03ea9cb896..fb047aad897 100644 --- a/source/blender/render/intern/source/pointdensity.c +++ b/source/blender/render/intern/source/pointdensity.c @@ -983,11 +983,12 @@ void RE_point_density_minmax( } else { float radius[3] = {pd->radius, pd->radius, pd->radius}; - float *loc, *size; + BoundBox *bb = BKE_object_boundbox_get(object); - if (BKE_object_obdata_texspace_get(pd->object, NULL, &loc, &size, NULL)) { - sub_v3_v3v3(r_min, loc, size); - add_v3_v3v3(r_max, loc, size); + if (bb != NULL) { + BLI_assert((bb->flag & BOUNDBOX_DIRTY) == 0); + copy_v3_v3(r_min, bb->vec[0]); + copy_v3_v3(r_max, bb->vec[6]); /* Adjust texture space to include density points on the boundaries. 
*/ sub_v3_v3(r_min, radius); add_v3_v3(r_max, radius); diff --git a/source/blender/render/intern/source/volume_precache.c b/source/blender/render/intern/source/volume_precache.c index 5377d0eba00..752a9df0b79 100644 --- a/source/blender/render/intern/source/volume_precache.c +++ b/source/blender/render/intern/source/volume_precache.c @@ -60,6 +60,8 @@ #include "volumetric.h" #include "volume_precache.h" +#include "atomic_ops.h" + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ /* defined in pipeline.c, is hardcopy of active dynamic allocated Render */ @@ -509,7 +511,8 @@ static void *vol_precache_part_test(void *data) */ typedef struct VolPrecacheState { double lasttime; - int totparts; + unsigned int doneparts; + unsigned int totparts; } VolPrecacheState; static void vol_precache_part(TaskPool * __restrict pool, void *taskdata, int UNUSED(threadid)) @@ -574,13 +577,15 @@ static void vol_precache_part(TaskPool * __restrict pool, void *taskdata, int UN } } + unsigned int doneparts = atomic_add_and_fetch_u(&state->doneparts, 1); + time = PIL_check_seconds_timer(); if (time - state->lasttime > 1.0) { ThreadMutex *mutex = BLI_task_pool_user_mutex(pool); if (BLI_mutex_trylock(mutex)) { char str[64]; - float ratio = (float)BLI_task_pool_tasks_done(pool)/(float)state->totparts; + float ratio = (float)doneparts/(float)state->totparts; BLI_snprintf(str, sizeof(str), IFACE_("Precaching volume: %d%%"), (int)(100.0f * ratio)); re->i.infostr = str; re->stats_draw(re->sdh, &re->i); @@ -631,6 +636,7 @@ static void precache_launch_parts(Render *re, RayObject *tree, ShadeInput *shi, /* setup task scheduler */ memset(&state, 0, sizeof(state)); + state.doneparts = 0; state.totparts = parts[0]*parts[1]*parts[2]; state.lasttime = PIL_check_seconds_timer(); diff --git a/source/blender/windowmanager/intern/wm_event_system.c b/source/blender/windowmanager/intern/wm_event_system.c index d2b0acd836b..eba132062c9 100644 --- 
a/source/blender/windowmanager/intern/wm_event_system.c +++ b/source/blender/windowmanager/intern/wm_event_system.c @@ -3179,6 +3179,8 @@ void wm_event_add_ghostevent(wmWindowManager *wm, wmWindow *win, int type, int U GHOST_TEventCursorData *cd = customdata; copy_v2_v2_int(&event.x, &cd->x); + wm_stereo3d_mouse_offset_apply(win, &event.x); + event.type = MOUSEMOVE; wm_event_add_mousemove(win, &event); copy_v2_v2_int(&evt->x, &event.x); diff --git a/source/blender/windowmanager/intern/wm_init_exit.c b/source/blender/windowmanager/intern/wm_init_exit.c index c11c398c616..4b2369a1a7c 100644 --- a/source/blender/windowmanager/intern/wm_init_exit.c +++ b/source/blender/windowmanager/intern/wm_init_exit.c @@ -444,8 +444,6 @@ void WM_exit_ext(bContext *C, const bool do_python) { wmWindowManager *wm = C ? CTX_wm_manager(C) : NULL; - BKE_sound_exit(); - /* first wrap up running stuff, we assume only the active WM is running */ /* modal handlers are on window level freed, others too? */ /* note; same code copied in wm_files.c */ @@ -591,6 +589,10 @@ void WM_exit_ext(bContext *C, const bool do_python) BLI_threadapi_exit(); + /* No need to call this early, rather do it late so that other pieces of Blender using sound may exit cleanly, + * see also T50676. */ + BKE_sound_exit(); + BKE_blender_atexit(); if (MEM_get_memory_blocks_in_use() != 0) { diff --git a/source/blender/windowmanager/intern/wm_stereo.c b/source/blender/windowmanager/intern/wm_stereo.c index 46cee907991..66ebf18c9e1 100644 --- a/source/blender/windowmanager/intern/wm_stereo.c +++ b/source/blender/windowmanager/intern/wm_stereo.c @@ -345,6 +345,32 @@ bool WM_stereo3d_enabled(wmWindow *win, bool skip_stereo3d_check) return true; } +/** + * If needed, this adjusts \a r_mouse_xy so that drawn cursor and handled mouse position are matching visually. 
+*/ +void wm_stereo3d_mouse_offset_apply(wmWindow *win, int *r_mouse_xy) +{ + if (!WM_stereo3d_enabled(win, false)) + return; + + if (win->stereo3d_format->display_mode == S3D_DISPLAY_SIDEBYSIDE) { + const int half_x = win->sizex / 2; + /* right half of the screen */ + if (r_mouse_xy[0] > half_x) { + r_mouse_xy[0] -= half_x; + } + r_mouse_xy[0] *= 2; + } + else if (win->stereo3d_format->display_mode == S3D_DISPLAY_TOPBOTTOM) { + const int half_y = win->sizey / 2; + /* upper half of the screen */ + if (r_mouse_xy[1] > half_y) { + r_mouse_xy[1] -= half_y; + } + r_mouse_xy[1] *= 2; + } +} + /************************** Stereo 3D operator **********************************/ typedef struct Stereo3dData { Stereo3dFormat stereo3d_format; diff --git a/source/blender/windowmanager/wm.h b/source/blender/windowmanager/wm.h index 2f06ddab1e8..e8485359490 100644 --- a/source/blender/windowmanager/wm.h +++ b/source/blender/windowmanager/wm.h @@ -78,6 +78,7 @@ void wm_autosave_location(char *filepath); /* wm_stereo.c */ void wm_method_draw_stereo3d(const bContext *C, wmWindow *win); +void wm_stereo3d_mouse_offset_apply(wmWindow *win, int *r_mouse_xy); int wm_stereo3d_set_exec(bContext *C, wmOperator *op); int wm_stereo3d_set_invoke(bContext *C, wmOperator *op, const wmEvent *event); void wm_stereo3d_set_draw(bContext *C, wmOperator *op); diff --git a/source/creator/creator_args.c b/source/creator/creator_args.c index 27579e58dba..658a0b2db08 100644 --- a/source/creator/creator_args.c +++ b/source/creator/creator_args.c @@ -946,7 +946,7 @@ static int arg_handle_native_pixels_set(int UNUSED(argc), const char **UNUSED(ar } static const char arg_handle_with_borders_doc[] = -"\n\tForce opening without borders" +"\n\tForce opening with borders" ; static int arg_handle_with_borders(int UNUSED(argc), const char **UNUSED(argv), void *UNUSED(data)) { @@ -1364,7 +1364,7 @@ static int arg_handle_render_frame(int argc, const char **argv, void *data) re = RE_NewRender(scene->id.name); 
BLI_begin_threaded_malloc(); - BKE_reports_init(&reports, RPT_PRINT); + BKE_reports_init(&reports, RPT_STORE); RE_SetReports(re, &reports); for (int i = 0; i < frames_range_len; i++) { @@ -1379,6 +1379,7 @@ static int arg_handle_render_frame(int argc, const char **argv, void *data) } } RE_SetReports(re, NULL); + BKE_reports_clear(&reports); BLI_end_threaded_malloc(); MEM_freeN(frame_range_arr); return 1; @@ -1406,10 +1407,11 @@ static int arg_handle_render_animation(int UNUSED(argc), const char **UNUSED(arg Render *re = RE_NewRender(scene->id.name); ReportList reports; BLI_begin_threaded_malloc(); - BKE_reports_init(&reports, RPT_PRINT); + BKE_reports_init(&reports, RPT_STORE); RE_SetReports(re, &reports); RE_BlenderAnim(re, bmain, scene, NULL, scene->lay, scene->r.sfra, scene->r.efra, scene->r.frame_step); RE_SetReports(re, NULL); + BKE_reports_clear(&reports); BLI_end_threaded_malloc(); } else { diff --git a/source/gameengine/VideoTexture/FilterColor.cpp b/source/gameengine/VideoTexture/FilterColor.cpp index eed84a8580c..15a7e9e4cd1 100644 --- a/source/gameengine/VideoTexture/FilterColor.cpp +++ b/source/gameengine/VideoTexture/FilterColor.cpp @@ -68,7 +68,7 @@ PyTypeObject FilterGrayType = 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT, /*tp_flags*/ - "Filter for gray scale effect", /* tp_doc */ + "Filter for grayscale effect", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ diff --git a/source/gameengine/VideoTexture/FilterColor.h b/source/gameengine/VideoTexture/FilterColor.h index 350f7270874..d042863d7e8 100644 --- a/source/gameengine/VideoTexture/FilterColor.h +++ b/source/gameengine/VideoTexture/FilterColor.h @@ -36,7 +36,7 @@ #include "FilterBase.h" -/// pixel filter for gray scale +/// pixel filter for grayscale class FilterGray : public FilterBase { public: @@ -53,7 +53,7 @@ protected: // calculate gray value unsigned int gray = (28 * (VT_B(val)) + 151 * (VT_G(val)) + 77 * (VT_R(val))) >> 8; - // return gray scale 
value + // return grayscale value VT_R(val) = gray; VT_G(val) = gray; VT_B(val) = gray; diff --git a/tests/gtests/blenlib/BLI_array_store_test.cc b/tests/gtests/blenlib/BLI_array_store_test.cc index 5af6e639e64..370a4111bae 100644 --- a/tests/gtests/blenlib/BLI_array_store_test.cc +++ b/tests/gtests/blenlib/BLI_array_store_test.cc @@ -36,15 +36,15 @@ static void print_mem_saved(const char *id, const BArrayStore *bs) /* -------------------------------------------------------------------- */ /* Test Chunks (building data from list of chunks) */ -typedef struct TestChunnk { - struct TestChunnk *next, *prev; +typedef struct TestChunk { + struct TestChunk *next, *prev; const void *data; size_t data_len; -} TestChunnk; +} TestChunk; -static TestChunnk *testchunk_list_add(ListBase *lb, const void *data, size_t data_len) +static TestChunk *testchunk_list_add(ListBase *lb, const void *data, size_t data_len) { - TestChunnk *tc = (TestChunnk *)MEM_mallocN(sizeof(*tc), __func__); + TestChunk *tc = (TestChunk *)MEM_mallocN(sizeof(*tc), __func__); tc->data = data; tc->data_len = data_len; BLI_addtail(lb, tc); @@ -53,7 +53,7 @@ static TestChunnk *testchunk_list_add(ListBase *lb, const void *data, size_t dat } #if 0 -static TestChunnk *testchunk_list_add_copydata(ListBase *lb, const void *data, size_t data_len) +static TestChunk *testchunk_list_add_copydata(ListBase *lb, const void *data, size_t data_len) { void *data_copy = MEM_mallocN(data_len, __func__); memcpy(data_copy, data, data_len); @@ -63,7 +63,7 @@ static TestChunnk *testchunk_list_add_copydata(ListBase *lb, const void *data, s static void testchunk_list_free(ListBase *lb) { - for (TestChunnk *tc = (TestChunnk *)lb->first, *tb_next; tc; tc = tb_next) { + for (TestChunk *tc = (TestChunk *)lb->first, *tb_next; tc; tc = tb_next) { tb_next = tc->next; MEM_freeN((void *)tc->data); MEM_freeN(tc); @@ -77,12 +77,12 @@ static char *testchunk_as_data( size_t *r_data_len) { size_t data_len = 0; - for (TestChunnk *tc = (TestChunnk 
*)lb->first; tc; tc = tc->next) { + for (TestChunk *tc = (TestChunk *)lb->first; tc; tc = tc->next) { data_len += tc->data_len; } char *data = (char *)MEM_mallocN(data_len, __func__); size_t i = 0; - for (TestChunnk *tc = (TestChunnk *)lb->first; tc; tc = tc->next) { + for (TestChunk *tc = (TestChunk *)lb->first; tc; tc = tc->next) { memcpy(&data[i], tc->data, tc->data_len); data_len += tc->data_len; i += tc->data_len; @@ -95,7 +95,7 @@ static char *testchunk_as_data( #endif static char *testchunk_as_data_array( - TestChunnk **tc_array, int tc_array_len, + TestChunk **tc_array, int tc_array_len, size_t *r_data_len) { size_t data_len = 0; @@ -105,7 +105,7 @@ static char *testchunk_as_data_array( char *data = (char *)MEM_mallocN(data_len, __func__); size_t i = 0; for (int tc_index = 0; tc_index < tc_array_len; tc_index++) { - TestChunnk *tc = tc_array[tc_index]; + TestChunk *tc = tc_array[tc_index]; memcpy(&data[i], tc->data, tc->data_len); i += tc->data_len; } @@ -677,9 +677,9 @@ static void random_chunk_mutate_helper( ListBase random_chunks; BLI_listbase_clear(&random_chunks); random_chunk_generate(&random_chunks, chunks_per_buffer, stride, chunk_count, random_seed); - TestChunnk **chunks_array = (TestChunnk **)MEM_mallocN(chunks_per_buffer * sizeof(TestChunnk *), __func__); + TestChunk **chunks_array = (TestChunk **)MEM_mallocN(chunks_per_buffer * sizeof(TestChunk *), __func__); { - TestChunnk *tc = (TestChunnk *)random_chunks.first; + TestChunk *tc = (TestChunk *)random_chunks.first; for (int i = 0; i < chunks_per_buffer; i++, tc = tc->next) { chunks_array[i] = tc; } @@ -692,7 +692,7 @@ static void random_chunk_mutate_helper( { RNG *rng = BLI_rng_new(random_seed); for (int i = 0; i < items_total; i++) { - BLI_rng_shuffle_array(rng, chunks_array, sizeof(TestChunnk *), chunks_per_buffer); + BLI_rng_shuffle_array(rng, chunks_array, sizeof(TestChunk *), chunks_per_buffer); size_t data_len; char *data = testchunk_as_data_array(chunks_array, chunks_per_buffer, 
&data_len); BLI_assert(data_len == chunks_per_buffer * chunk_count * stride); diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt index 935a2a807fe..b76c47fcf25 100644 --- a/tests/python/CMakeLists.txt +++ b/tests/python/CMakeLists.txt @@ -95,6 +95,11 @@ add_test(bevel ${TEST_BLENDER_EXE} --python-text run_tests ) +add_test(split_faces ${TEST_BLENDER_EXE} + ${TEST_SRC_DIR}/modeling/split_faces_test.blend + --python-text run_tests +) + # ------------------------------------------------------------------------------ # IO TESTS @@ -421,6 +426,8 @@ if(WITH_CYCLES) if(WITH_OPENGL_TESTS) add_cycles_render_test(opengl) endif() + add_cycles_render_test(image) + add_cycles_render_test(mblur) add_cycles_render_test(reports) add_cycles_render_test(render) add_cycles_render_test(shader) |