Merge branch 'master' into cycles_disney_brdf

author: Schoen <schoepas@deher1m1598.emea.adsint.biz> 2017-04-03 08:52:05 +0300
committer: Schoen <schoepas@deher1m1598.emea.adsint.biz> 2017-04-03 09:47:02 +0300
commit: 90778901c9ea1e16d5907981e91bceba25ff207d (patch)
tree: 00bf16ee403c3ab1b507a5ec90b098a22e050f7c
parent: 76eebd9379a5dad519ff01cf215fbe3db6de931e (diff)
parent: 3bf0026bec6f839ae0f7c40bde845299ce22e8de (diff)
760 files changed, 23549 insertions, 13520 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 092de982b9d..3a91e1b655a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -445,6 +445,7 @@ option(WITH_BOOST					"Enable features depending on boost" ON)
 
 # Unit testsing
 option(WITH_GTESTS "Enable GTest unit testing" OFF)
+option(WITH_OPENGL_TESTS "Enable OpenGL related unit testing (Experimental)" OFF)
 
 
 # Documentation
@@ -518,18 +519,20 @@ endif()
 option(WITH_LEGACY_DEPSGRAPH "Build Blender with legacy dependency graph" ON)
 mark_as_advanced(WITH_LEGACY_DEPSGRAPH)
 
-# Use hardcoded paths or find_package to find externals
-option(WITH_WINDOWS_FIND_MODULES "Use find_package to locate libraries" OFF)
-mark_as_advanced(WITH_WINDOWS_FIND_MODULES)
+if(WIN32)
+	# Use hardcoded paths or find_package to find externals
+	option(WITH_WINDOWS_FIND_MODULES "Use find_package to locate libraries" OFF)
+	mark_as_advanced(WITH_WINDOWS_FIND_MODULES)
 
-option(WITH_WINDOWS_CODESIGN "Use signtool to sign the final binary." OFF)
-mark_as_advanced(WITH_WINDOWS_CODESIGN)
+	option(WITH_WINDOWS_CODESIGN "Use signtool to sign the final binary." OFF)
+	mark_as_advanced(WITH_WINDOWS_CODESIGN)
 
-set(WINDOWS_CODESIGN_PFX CACHE FILEPATH  "Path to pfx file to use for codesigning.")
-mark_as_advanced(WINDOWS_CODESIGN_PFX)
+	set(WINDOWS_CODESIGN_PFX CACHE FILEPATH  "Path to pfx file to use for codesigning.")
+	mark_as_advanced(WINDOWS_CODESIGN_PFX)
 
-set(WINDOWS_CODESIGN_PFX_PASSWORD CACHE STRING  "password for pfx file used for codesigning.")
-mark_as_advanced(WINDOWS_CODESIGN_PFX_PASSWORD)
+	set(WINDOWS_CODESIGN_PFX_PASSWORD CACHE STRING  "password for pfx file used for codesigning.")
+	mark_as_advanced(WINDOWS_CODESIGN_PFX_PASSWORD)
+endif()
 
 # avoid using again
 option_defaults_clear()
@@ -924,7 +927,7 @@ if(WITH_X11)
 	if(WITH_X11_ALPHA)
 		find_library(X11_Xrender_LIB Xrender  ${X11_LIB_SEARCH_PATH})
 		mark_as_advanced(X11_Xrender_LIB)
-		if (X11_Xrender_LIB)
+		if(X11_Xrender_LIB)
 			list(APPEND PLATFORM_LINKLIBS ${X11_Xrender_LIB})
 		else()
 			set(WITH_X11_ALPHA OFF)
diff --git a/GNUmakefile b/GNUmakefile
index 1fda1a25a92..86964e68873 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,4 +1,4 @@
-# -*- mode: gnumakefile; tab-width: 8; indent-tabs-mode: t; -*-
+# -*- mode: gnumakefile; tab-width: 4; indent-tabs-mode: t; -*-
 # vim: tabstop=4
 #
 # ##### BEGIN GPL LICENSE BLOCK #####
@@ -113,7 +113,7 @@ CMAKE_CONFIG = cmake $(BUILD_CMAKE_ARGS) \
 # X11 spesific
 ifdef DISPLAY
 	CMAKE_CONFIG_TOOL = cmake-gui
-else 
+else
 	CMAKE_CONFIG_TOOL = ccmake
 endif
 
@@ -127,7 +127,7 @@ all: .FORCE
 #	# if test ! -f $(BUILD_DIR)/CMakeCache.txt ; then \
 #	# 	$(CMAKE_CONFIG); \
 #	# fi
-	
+
 #	# do this always incase of failed initial build, could be smarter here...
 	@$(CMAKE_CONFIG)
 
diff --git a/build_files/build_environment/install_deps.sh b/build_files/build_environment/install_deps.sh
index ee49f83b38d..ed1b9a3e62a 100755
--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -4252,7 +4252,7 @@ print_info() {
     PRINT "  $_3"
     _buildargs="$_buildargs $_1 $_2 $_3"
     if [ -d $INST/osl ]; then
-      _1="-D CYCLES_OSL=$INST/osl"
+      _1="-D OSL_ROOT_DIR=$INST/osl"
       PRINT "  $_1"
       _buildargs="$_buildargs $_1"
     fi
diff --git a/build_files/buildbot/slave_compile.py b/build_files/buildbot/slave_compile.py
index 860dd1174cf..c8d69c38644 100644
--- a/build_files/buildbot/slave_compile.py
+++ b/build_files/buildbot/slave_compile.py
@@ -91,7 +91,6 @@ if 'cmake' in builder:
             elif builder.startswith('win32'):
                 bits = 32
                 cmake_options.extend(['-G', 'Visual Studio 12 2013'])
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE:FILEPATH=C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin/nvcc.exe')
 
     elif builder.startswith('linux'):
         tokens = builder.split("_")
@@ -111,8 +110,6 @@ if 'cmake' in builder:
             cuda_chroot_name = 'buildbot_' + deb_name + '_x86_64'
             targets = ['player', 'blender', 'cuda']
 
-        cmake_extra_options.append('-DCUDA_NVCC_EXECUTABLE=/usr/local/cuda-8.0/bin/nvcc')
-
     cmake_options.append("-C" + os.path.join(blender_dir, cmake_config_file))
 
     # Prepare CMake options needed to configure cuda binaries compilation.
diff --git a/build_files/cmake/buildinfo.cmake b/build_files/cmake/buildinfo.cmake
index 1f0a283ba0e..a43b99f5d1e 100644
--- a/build_files/cmake/buildinfo.cmake
+++ b/build_files/cmake/buildinfo.cmake
@@ -56,7 +56,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
 				string(REGEX REPLACE "[\r\n]+" ";" _git_contains_branches "${_git_contains_branches}")
 				string(REGEX REPLACE ";[ \t]+" ";" _git_contains_branches "${_git_contains_branches}")
 				foreach(_branch ${_git_contains_branches})
-					if (NOT "${_branch}" MATCHES "\\(HEAD.*")
+					if(NOT "${_branch}" MATCHES "\\(HEAD.*")
 						set(MY_WC_BRANCH "${_branch}")
 						break()
 					endif()
diff --git a/build_files/cmake/macros.cmake b/build_files/cmake/macros.cmake
index 09428953a46..f9992ee92b9 100644
--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -1574,24 +1574,24 @@ macro(openmp_delayload
 endmacro()
 
 MACRO(WINDOWS_SIGN_TARGET target)
-	if (WITH_WINDOWS_CODESIGN)
-		if (!SIGNTOOL_EXE)
+	if(WITH_WINDOWS_CODESIGN)
+		if(!SIGNTOOL_EXE)
 			error("Codesigning is enabled, but signtool is not found")
 		else()
-			if (WINDOWS_CODESIGN_PFX_PASSWORD)
+			if(WINDOWS_CODESIGN_PFX_PASSWORD)
 				set(CODESIGNPASSWORD /p ${WINDOWS_CODESIGN_PFX_PASSWORD})
 			else()
-				if ($ENV{PFXPASSWORD})
+				if($ENV{PFXPASSWORD})
 					set(CODESIGNPASSWORD /p $ENV{PFXPASSWORD})
 				else()
-					message( FATAL_ERROR "WITH_WINDOWS_CODESIGN is on but WINDOWS_CODESIGN_PFX_PASSWORD not set, and environment variable PFXPASSWORD not found, unable to sign code.")
+					message(FATAL_ERROR "WITH_WINDOWS_CODESIGN is on but WINDOWS_CODESIGN_PFX_PASSWORD not set, and environment variable PFXPASSWORD not found, unable to sign code.")
 				endif()
 			endif()
 			add_custom_command(TARGET ${target}
-						POST_BUILD
-						COMMAND ${SIGNTOOL_EXE} sign /f ${WINDOWS_CODESIGN_PFX} ${CODESIGNPASSWORD} $<TARGET_FILE:${target}>
-						VERBATIM
-				)
+				POST_BUILD
+				COMMAND ${SIGNTOOL_EXE} sign /f ${WINDOWS_CODESIGN_PFX} ${CODESIGNPASSWORD} $<TARGET_FILE:${target}>
+				VERBATIM
+			)
 		endif()
 	endif()
 ENDMACRO()
diff --git a/build_files/cmake/packaging.cmake b/build_files/cmake/packaging.cmake
index c7063ed6772..5b894d96f95 100644
--- a/build_files/cmake/packaging.cmake
+++ b/build_files/cmake/packaging.cmake
@@ -1,5 +1,7 @@
-set(PROJECT_DESCRIPTION  "Blender is a very fast and versatile 3D modeller/renderer.")
-set(PROJECT_COPYRIGHT    "Copyright (C) 2001-2012 Blender Foundation")
+string(TIMESTAMP CURRENT_YEAR "%Y")
+
+set(PROJECT_DESCRIPTION  "Blender is the free and open source 3D creation suite software.")
+set(PROJECT_COPYRIGHT    "Copyright (C) 2001-${CURRENT_YEAR} Blender Foundation")
 set(PROJECT_CONTACT      "foundation@blender.org")
 set(PROJECT_VENDOR       "Blender Foundation")
 
@@ -38,8 +40,8 @@ unset(MY_WC_HASH)
 # Force Package Name
 execute_process(COMMAND date "+%Y%m%d" OUTPUT_VARIABLE CPACK_DATE OUTPUT_STRIP_TRAILING_WHITESPACE)
 string(TOLOWER ${PROJECT_NAME} PROJECT_NAME_LOWER)
-if (MSVC)
-	if ("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+if(MSVC)
+	if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
 		set(PACKAGE_ARCH windows64)
 	else()
 		set(PACKAGE_ARCH windows32)
@@ -48,7 +50,7 @@ else(MSVC)
 	set(PACKAGE_ARCH ${CMAKE_SYSTEM_PROCESSOR})
 endif()
 
-if (CPACK_OVERRIDE_PACKAGENAME)
+if(CPACK_OVERRIDE_PACKAGENAME)
 	set(CPACK_PACKAGE_FILE_NAME ${CPACK_OVERRIDE_PACKAGENAME}-${PACKAGE_ARCH})
 else()
 	set(CPACK_PACKAGE_FILE_NAME ${PROJECT_NAME_LOWER}-${MAJOR_VERSION}.${MINOR_VERSION}.${PATCH_VERSION}-git${CPACK_DATE}.${BUILD_REV}-${PACKAGE_ARCH})
@@ -135,4 +137,3 @@ unset(MINOR_VERSION)
 unset(PATCH_VERSION)
 
 unset(BUILD_REV)
-
diff --git a/build_files/cmake/platform/platform_win32_msvc.cmake b/build_files/cmake/platform/platform_win32_msvc.cmake
index 45a44596e61..3ea93a96290 100644
--- a/build_files/cmake/platform/platform_win32_msvc.cmake
+++ b/build_files/cmake/platform/platform_win32_msvc.cmake
@@ -33,7 +33,7 @@ endmacro()
 macro(windows_find_package package_name
 	)
 	if(WITH_WINDOWS_FIND_MODULES)
-		find_package( ${package_name})
+		find_package(${package_name})
 	endif(WITH_WINDOWS_FIND_MODULES)
 endmacro()
 
diff --git a/doc/python_api/rst/bge.texture.rst b/doc/python_api/rst/bge.texture.rst
index 49f6c4469a4..3028ee653f8 100644
--- a/doc/python_api/rst/bge.texture.rst
+++ b/doc/python_api/rst/bge.texture.rst
@@ -681,7 +681,7 @@ Image classes
 
    .. attribute:: zbuff
 
-      Use depth component of render as grey scale color -  suitable for texture source.
+      Use depth component of render as grayscale color - suitable for texture source.
 
       :type: bool
 
@@ -817,7 +817,7 @@ Image classes
 
    .. attribute:: zbuff
 
-      Use depth component of viewport as grey scale color - suitable for texture source.
+      Use depth component of viewport as grayscale color - suitable for texture source.
 
       :type: bool
 
@@ -1260,8 +1260,8 @@ Filter classes
 
 .. class:: FilterGray
 
-   Filter for gray scale effect.
-   Proportions of R, G and B contributions in the output gray scale are 28:151:77.
+   Filter for grayscale effect.
+   Proportions of R, G and B contributions in the output grayscale are 28:151:77.
 
    .. attribute:: previous
 
diff --git a/doc/python_api/sphinx_doc_gen.py b/doc/python_api/sphinx_doc_gen.py
index ec3131ca19e..47bb323e574 100644
--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -427,9 +427,9 @@ if BLENDER_REVISION != "Unknown":
     BLENDER_VERSION_DOTS += " " + BLENDER_REVISION          # '2.62.1 SHA1'
 
 BLENDER_VERSION_PATH = "_".join(blender_version_strings)    # '2_62_1'
-if bpy.app.version_cycle == "release":
-    BLENDER_VERSION_PATH = "%s%s_release" % ("_".join(blender_version_strings[:2]),
-                                             bpy.app.version_char)   # '2_62_release'
+if bpy.app.version_cycle in {"rc", "release"}:
+    # '2_62a_release'
+    BLENDER_VERSION_PATH = "%s%s_release" % ("_".join(blender_version_strings[:2]), bpy.app.version_char)
 
 # --------------------------DOWNLOADABLE FILES----------------------------------
 
diff --git a/doc/python_api/sphinx_doc_update.py b/doc/python_api/sphinx_doc_update.py
index 3d48c1145e1..561e58dec66 100755
--- a/doc/python_api/sphinx_doc_update.py
+++ b/doc/python_api/sphinx_doc_update.py
@@ -96,6 +96,11 @@ def main():
 
     rsync_base = "rsync://%s@%s:%s" % (args.user, args.rsync_server, args.rsync_root)
 
+    blenver = blenver_zip = ""
+    api_name = ""
+    branch = ""
+    is_release = False
+
     # I) Update local mirror using rsync.
     rsync_mirror_cmd = ("rsync", "--delete-after", "-avzz", rsync_base, args.mirror_dir)
     subprocess.run(rsync_mirror_cmd, env=dict(os.environ, RSYNC_PASSWORD=args.password))
@@ -108,19 +113,24 @@ def main():
         subprocess.run(doc_gen_cmd)
 
         # III) Get Blender version info.
-        blenver = blenver_zip = ""
         getver_file = os.path.join(tmp_dir, "blendver.txt")
         getver_script = (""
             "import sys, bpy\n"
             "with open(sys.argv[-1], 'w') as f:\n"
-            "    f.write('%d_%d%s_release\\n' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n"
-            "            if bpy.app.version_cycle in {'rc', 'release'} else '%d_%d_%d\\n' % bpy.app.version)\n"
-            "    f.write('%d_%d_%d' % bpy.app.version)\n")
+            "    is_release = bpy.app.version_cycle in {'rc', 'release'}\n"
+            "    branch = bpy.app.build_branch.split()[0].decode()\n"
+            "    f.write('%d\\n' % is_release)\n"
+            "    f.write('%s\\n' % branch)\n"
+            "    f.write('%d.%d%s\\n' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n"
+            "            if is_release else '%s\\n' % branch)\n"
+            "    f.write('%d_%d%s_release' % (bpy.app.version[0], bpy.app.version[1], bpy.app.version_char)\n"
+            "            if is_release else '%d_%d_%d' % bpy.app.version)\n")
         get_ver_cmd = (args.blender, "--background", "-noaudio", "--factory-startup", "--python-exit-code", "1",
                        "--python-expr", getver_script, "--", getver_file)
         subprocess.run(get_ver_cmd)
         with open(getver_file) as f:
-            blenver, blenver_zip = f.read().split("\n")
+            is_release, branch, blenver, blenver_zip = f.read().split("\n")
+            is_release = bool(int(is_release))
         os.remove(getver_file)
 
         # IV) Build doc.
@@ -132,7 +142,7 @@ def main():
         os.chdir(curr_dir)
 
         # V) Cleanup existing matching dir in server mirror (if any), and copy new doc.
-        api_name = "blender_python_api_%s" % blenver
+        api_name = blenver
         api_dir = os.path.join(args.mirror_dir, api_name)
         if os.path.exists(api_dir):
             shutil.rmtree(api_dir)
@@ -150,19 +160,15 @@ def main():
     os.rename(zip_path, os.path.join(api_dir, "%s.zip" % zip_name))
 
     # VII) Create symlinks and html redirects.
-    #~ os.symlink(os.path.join(DEFAULT_SYMLINK_ROOT, api_name, "contents.html"), os.path.join(api_dir, "index.html"))
     os.symlink("./contents.html", os.path.join(api_dir, "index.html"))
-    if blenver.endswith("release"):
-        symlink = os.path.join(args.mirror_dir, "blender_python_api_current")
+    if is_release:
+        symlink = os.path.join(args.mirror_dir, "current")
         os.remove(symlink)
         os.symlink("./%s" % api_name, symlink)
         with open(os.path.join(args.mirror_dir, "250PythonDoc/index.html"), 'w') as f:
             f.write("<html><head><title>Redirecting...</title><meta http-equiv=\"REFRESH\""
                     "content=\"0;url=../%s/\"></head><body>Redirecting...</body></html>" % api_name)
-    else:
-        symlink = os.path.join(args.mirror_dir, "blender_python_api_master")
-        os.remove(symlink)
-        os.symlink("./%s" % api_name, symlink)
+    elif branch == "master":
         with open(os.path.join(args.mirror_dir, "blender_python_api/index.html"), 'w') as f:
             f.write("<html><head><title>Redirecting...</title><meta http-equiv=\"REFRESH\""
                     "content=\"0;url=../%s/\"></head><body>Redirecting...</body></html>" % api_name)
diff --git a/extern/clew/README.blender b/extern/clew/README.blender
index 8355a96db8e..127b6bdac63 100644
--- a/extern/clew/README.blender
+++ b/extern/clew/README.blender
@@ -1,5 +1,5 @@
 Project: OpenCL Wrangler
 URL: https://github.com/OpenCLWrangler/clew
 License: Apache 2.0
-Upstream version: 309a653
+Upstream version: 27a6867
 Local modifications: None
diff --git a/extern/clew/include/clew.h b/extern/clew/include/clew.h
index 2a583c81599..0bbb95b9839 100644
--- a/extern/clew/include/clew.h
+++ b/extern/clew/include/clew.h
@@ -369,7 +369,7 @@ typedef unsigned int cl_GLenum;
 #endif
 
 /* Define basic vector types */
-/* WOrkaround for ppc64el platform: conflicts with bool from C++. */
+/* Workaround for ppc64el platform: conflicts with bool from C++. */
 #if defined( __VEC__ ) && !(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
    #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
    typedef vector unsigned char     __cl_uchar16;
@@ -2765,11 +2765,40 @@ CLEW_FUN_EXPORT     PFNCLGETGLCONTEXTINFOKHR            __clewGetGLContextInfoKH
 #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
 #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
 #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#define CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV   0x4007
+#define CL_DEVICE_PCI_BUS_ID_NV                     0x4008
+#define CL_DEVICE_PCI_SLOT_ID_NV                    0x4009
 
 /*********************************
  * cl_amd_device_attribute_query *
  *********************************/
 #define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
+#define CL_DEVICE_TOPOLOGY_AMD                      0x4037
+#define CL_DEVICE_BOARD_NAME_AMD                    0x4038
+#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD            0x4039
+#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD         0x4040
+#define CL_DEVICE_SIMD_WIDTH_AMD                    0x4041
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD        0x4042
+#define CL_DEVICE_WAVEFRONT_WIDTH_AMD               0x4043
+#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD           0x4044
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD      0x4045
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD    0x4046
+#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD  0x4047
+#define CL_DEVICE_LOCAL_MEM_BANKS_AMD               0x4048
+#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD        0x4049
+#define CL_DEVICE_GFXIP_MAJOR_AMD                   0x404A
+#define CL_DEVICE_GFXIP_MINOR_AMD                   0x404B
+#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD        0x404C
+
+#ifndef CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD
+#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD            1
+
+typedef union
+{
+    struct { cl_uint type; cl_uint data[5]; } raw;
+    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
+} cl_device_topology_amd;
+#endif
 
 /*********************************
  * cl_arm_printf extension
diff --git a/extern/clew/src/clew.c b/extern/clew/src/clew.c
index 3717c74f1a0..dbdddfc0332 100644
--- a/extern/clew/src/clew.c
+++ b/extern/clew/src/clew.c
@@ -15,7 +15,7 @@
 
     typedef HMODULE             CLEW_DYNLIB_HANDLE;
 
-    #define CLEW_DYNLIB_OPEN    LoadLibrary
+    #define CLEW_DYNLIB_OPEN    LoadLibraryA
     #define CLEW_DYNLIB_CLOSE   FreeLibrary
     #define CLEW_DYNLIB_IMPORT  GetProcAddress
 #else
@@ -223,7 +223,7 @@ int clewInit()
     __clewSetCommandQueueProperty       = (PFNCLSETCOMMANDQUEUEPROPERTY     )CLEW_DYNLIB_IMPORT(module, "clSetCommandQueueProperty");
 #endif
     __clewCreateBuffer                  = (PFNCLCREATEBUFFER                )CLEW_DYNLIB_IMPORT(module, "clCreateBuffer");
-    __clewCreateSubBuffer               = (PFNCLCREATESUBBUFFER             )CLEW_DYNLIB_IMPORT(module, "clCreateBuffer");
+    __clewCreateSubBuffer               = (PFNCLCREATESUBBUFFER             )CLEW_DYNLIB_IMPORT(module, "clCreateSubBuffer");
     __clewCreateImage                   = (PFNCLCREATEIMAGE                 )CLEW_DYNLIB_IMPORT(module, "clCreateImage");
     __clewRetainMemObject               = (PFNCLRETAINMEMOBJECT             )CLEW_DYNLIB_IMPORT(module, "clRetainMemObject");
     __clewReleaseMemObject              = (PFNCLRELEASEMEMOBJECT            )CLEW_DYNLIB_IMPORT(module, "clReleaseMemObject");
diff --git a/extern/cuew/include/cuew.h b/extern/cuew/include/cuew.h
index 19087117667..4cce29d38ab 100644
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -114,7 +114,7 @@ extern "C" {
 #define cuGLGetDevices cuGLGetDevices_v2
 
 /* Types. */
-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__)
 typedef unsigned long long CUdeviceptr;
 #else
 typedef unsigned int CUdeviceptr;
diff --git a/intern/CMakeLists.txt b/intern/CMakeLists.txt
index 9a5476772ab..bfe230250ae 100644
--- a/intern/CMakeLists.txt
+++ b/intern/CMakeLists.txt
@@ -34,7 +34,7 @@ add_subdirectory(mikktspace)
 add_subdirectory(glew-mx)
 add_subdirectory(eigen)
 
-if (WITH_GAMEENGINE_DECKLINK)
+if(WITH_GAMEENGINE_DECKLINK)
 	add_subdirectory(decklink)
 endif()
 
@@ -62,7 +62,7 @@ if(WITH_IK_ITASC)
 	add_subdirectory(itasc)
 endif()
 
-if(WITH_IK_SOLVER OR WITH_GAMEENGINE OR WITH_MOD_BOOLEAN)
+if(WITH_GAMEENGINE)
 	add_subdirectory(moto)
 endif()
 
diff --git a/intern/atomic/atomic_ops.h b/intern/atomic/atomic_ops.h
index 1107deddf94..1e9528f9ed9 100644
--- a/intern/atomic/atomic_ops.h
+++ b/intern/atomic/atomic_ops.h
@@ -101,11 +101,11 @@ ATOMIC_INLINE size_t atomic_fetch_and_add_z(size_t *p, size_t x);
 ATOMIC_INLINE size_t atomic_fetch_and_sub_z(size_t *p, size_t x);
 ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new);
 
-ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x);
-ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new);
+ATOMIC_INLINE unsigned int atomic_add_and_fetch_u(unsigned int *p, unsigned int x);
+ATOMIC_INLINE unsigned int atomic_sub_and_fetch_u(unsigned int *p, unsigned int x);
+ATOMIC_INLINE unsigned int atomic_fetch_and_add_u(unsigned int *p, unsigned int x);
+ATOMIC_INLINE unsigned int atomic_fetch_and_sub_u(unsigned int *p, unsigned int x);
+ATOMIC_INLINE unsigned int atomic_cas_u(unsigned int *v, unsigned int old, unsigned int _new);
 
 /* WARNING! Float 'atomics' are really faked ones, those are actually closer to some kind of spinlock-sync'ed operation,
  *          which means they are only efficient if collisions are highly unlikely (i.e. if probability of two threads
diff --git a/intern/atomic/intern/atomic_ops_ext.h b/intern/atomic/intern/atomic_ops_ext.h
index 8421aa72192..b72c94563fc 100644
--- a/intern/atomic/intern/atomic_ops_ext.h
+++ b/intern/atomic/intern/atomic_ops_ext.h
@@ -113,58 +113,58 @@ ATOMIC_INLINE size_t atomic_cas_z(size_t *v, size_t old, size_t _new)
 
 /******************************************************************************/
 /* unsigned operations. */
-ATOMIC_INLINE unsigned atomic_add_and_fetch_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned int atomic_add_and_fetch_u(unsigned int *p, unsigned int x)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
 
 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
+	return (unsigned int)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
+	return (unsigned int)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }
 
-ATOMIC_INLINE unsigned atomic_sub_and_fetch_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned int atomic_sub_and_fetch_u(unsigned int *p, unsigned int x)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
 
 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (unsigned int)atomic_add_and_fetch_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (unsigned int)atomic_add_and_fetch_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }
 
-ATOMIC_INLINE unsigned atomic_fetch_and_add_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned int atomic_fetch_and_add_u(unsigned int *p, unsigned int x)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
 
 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x);
+	return (unsigned int)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)x);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x);
+	return (unsigned int)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)x);
 #endif
 }
 
-ATOMIC_INLINE unsigned atomic_fetch_and_sub_u(unsigned *p, unsigned x)
+ATOMIC_INLINE unsigned int atomic_fetch_and_sub_u(unsigned int *p, unsigned int x)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
 
 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
+	return (unsigned int)atomic_fetch_and_add_uint64((uint64_t *)p, (uint64_t)-((int64_t)x));
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
+	return (unsigned int)atomic_fetch_and_add_uint32((uint32_t *)p, (uint32_t)-((int32_t)x));
 #endif
 }
 
-ATOMIC_INLINE unsigned atomic_cas_u(unsigned *v, unsigned old, unsigned _new)
+ATOMIC_INLINE unsigned int atomic_cas_u(unsigned int *v, unsigned int old, unsigned int _new)
 {
-	assert(sizeof(unsigned) == LG_SIZEOF_INT);
+	assert(sizeof(unsigned int) == LG_SIZEOF_INT);
 
 #if (LG_SIZEOF_INT == 8)
-	return (unsigned)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new);
+	return (unsigned int)atomic_cas_uint64((uint64_t *)v, (uint64_t)old, (uint64_t)_new);
 #elif (LG_SIZEOF_INT == 4)
-	return (unsigned)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new);
+	return (unsigned int)atomic_cas_uint32((uint32_t *)v, (uint32_t)old, (uint32_t)_new);
 #endif
 }
 
diff --git a/intern/audaspace/intern/AUD_SoftwareDevice.cpp b/intern/audaspace/intern/AUD_SoftwareDevice.cpp
index 15594d340be..f9d65aa2363 100644
--- a/intern/audaspace/intern/AUD_SoftwareDevice.cpp
+++ b/intern/audaspace/intern/AUD_SoftwareDevice.cpp
@@ -365,6 +365,7 @@ bool AUD_SoftwareDevice::AUD_SoftwareHandle::seek(float position)
 	if(!m_status)
 		return false;
 
+	m_pitch->setPitch(m_user_pitch);
 	m_reader->seek((int)(position * m_reader->getSpecs().rate));
 
 	if(m_status == AUD_STATUS_STOPPED)
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 79c1c3e3e82..806a8660e8c 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -22,6 +22,7 @@ if(WITH_CYCLES_NATIVE_ONLY)
 		-DWITH_KERNEL_NATIVE
 	)
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
+	set(CYCLES_KERNEL_FLAGS "-march=native")
 elseif(NOT WITH_CPU_SSE)
 	set(CXX_HAS_SSE FALSE)
 	set(CXX_HAS_AVX FALSE)
@@ -59,10 +60,13 @@ elseif(WIN32 AND MSVC)
 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Ox")
 	set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /Ox")
 	set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /Ox")
+
+	set(CYCLES_KERNEL_FLAGS "/fp:fast -D_CRT_SECURE_NO_WARNINGS /GS-")
 elseif(CMAKE_COMPILER_IS_GNUCC)
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
 	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
+	set(CYCLES_KERNEL_FLAGS "-ffast-math")
 	if(CXX_HAS_SSE)
 		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
 		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse")
@@ -74,10 +78,12 @@ elseif(CMAKE_COMPILER_IS_GNUCC)
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c -mfpmath=sse")
 	endif()
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
 elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	check_cxx_compiler_flag(-msse CXX_HAS_SSE)
 	check_cxx_compiler_flag(-mavx CXX_HAS_AVX)
 	check_cxx_compiler_flag(-mavx2 CXX_HAS_AVX2)
+	set(CYCLES_KERNEL_FLAGS "-ffast-math")
 	if(CXX_HAS_SSE)
 		set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2")
 		set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3")
@@ -89,6 +95,7 @@ elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 	if(CXX_HAS_AVX2)
 		set(CYCLES_AVX2_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -mlzcnt -mbmi -mbmi2 -mf16c")
 	endif()
+	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -fno-finite-math-only")
 endif()
 
 if(CXX_HAS_SSE)
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 8cd499b7ca6..aabb8f63640 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -1,14 +1,6 @@
 
 set(INC
-	.
-	../bvh
-	../device
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../subd
-	../util
+	..
 )
 set(INC_SYS
 )
diff --git a/intern/cycles/app/cycles_server.cpp b/intern/cycles/app/cycles_server.cpp
index 4ef9cd070bb..e65b9d769e4 100644
--- a/intern/cycles/app/cycles_server.cpp
+++ b/intern/cycles/app/cycles_server.cpp
@@ -16,15 +16,15 @@
 
 #include <stdio.h>
 
-#include "device.h"
-
-#include "util_args.h"
-#include "util_foreach.h"
-#include "util_path.h"
-#include "util_stats.h"
-#include "util_string.h"
-#include "util_task.h"
-#include "util_logging.h"
+#include "device/device.h"
+
+#include "util/util_args.h"
+#include "util/util_foreach.h"
+#include "util/util_path.h"
+#include "util/util_stats.h"
+#include "util/util_string.h"
+#include "util/util_task.h"
+#include "util/util_logging.h"
 
 using namespace ccl;
 
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 9816d614a7c..0cd249f0d84 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -16,29 +16,29 @@
 
 #include <stdio.h>
 
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "scene.h"
-#include "session.h"
-#include "integrator.h"
-
-#include "util_args.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_progress.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_transform.h"
-#include "util_version.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/integrator.h"
+
+#include "util/util_args.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_transform.h"
+#include "util/util_version.h"
 
 #ifdef WITH_CYCLES_STANDALONE_GUI
-#include "util_view.h"
+#include "util/util_view.h"
 #endif
 
-#include "cycles_xml.h"
+#include "app/cycles_xml.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 35a30ae683f..04f00ef0e10 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -20,31 +20,31 @@
 #include <algorithm>
 #include <iterator>
 
-#include "node_xml.h"
-
-#include "background.h"
-#include "camera.h"
-#include "film.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "osl.h"
-#include "shader.h"
-#include "scene.h"
-
-#include "subd_patch.h"
-#include "subd_split.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_path.h"
-#include "util_transform.h"
-#include "util_xml.h"
-
-#include "cycles_xml.h"
+#include "graph/node_xml.h"
+
+#include "render/background.h"
+#include "render/camera.h"
+#include "render/film.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/shader.h"
+#include "render/scene.h"
+
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_path.h"
+#include "util/util_transform.h"
+#include "util/util_xml.h"
+
+#include "app/cycles_xml.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index b57502b3b14..ae4977aaed0 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	../graph
-	../render
-	../device
-	../kernel
-	../kernel/svm
-	../util
-	../subd
+	..
 	../../glew-mx
 	../../guardedalloc
 	../../mikktspace
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index 235d19e91e8..eb792af7264 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -107,7 +107,13 @@ def engine_exit():
     engine.exit()
 
 
+classes = (
+    CyclesRender,
+)
+
+
 def register():
+    from bpy.utils import register_class
     from . import ui
     from . import properties
     from . import presets
@@ -122,12 +128,15 @@ def register():
     properties.register()
     ui.register()
     presets.register()
-    bpy.utils.register_module(__name__)
+
+    for cls in classes:
+        register_class(cls)
 
     bpy.app.handlers.version_update.append(version_update.do_versions)
 
 
 def unregister():
+    from bpy.utils import unregister_class
     from . import ui
     from . import properties
     from . import presets
@@ -138,4 +147,6 @@ def unregister():
     ui.unregister()
     properties.unregister()
     presets.unregister()
-    bpy.utils.unregister_module(__name__)
+
+    for cls in classes:
+        unregister_class(cls)
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index c8c9ef58c52..ab57dd44bdb 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -50,6 +50,24 @@ def _workaround_buggy_drivers():
             _cycles.opencl_disable()
 
 
+def _configure_argument_parser():
+    import argparse
+    parser = argparse.ArgumentParser(description="Cycles Addon argument parser")
+    parser.add_argument("--cycles-resumable-num-chunks",
+                        help="Number of chunks to split sample range into",
+                        default=None)
+    parser.add_argument("--cycles-resumable-current-chunk",
+                        help="Current chunk of samples range to render",
+                        default=None)
+    parser.add_argument("--cycles-resumable-start-chunk",
+                        help="Start chunk to render",
+                        default=None)
+    parser.add_argument("--cycles-resumable-end-chunk",
+                        help="End chunk to render",
+                        default=None)
+    return parser
+
+
 def _parse_command_line():
     import sys
 
@@ -57,25 +75,22 @@ def _parse_command_line():
     if "--" not in argv:
         return
 
-    argv = argv[argv.index("--") + 1:]
-
-    num_resumable_chunks = None
-    current_resumable_chunk = None
-
-    # TODO(sergey): Add some nice error prints if argument is not used properly.
-    idx = 0
-    while idx < len(argv) - 1:
-        arg = argv[idx]
-        if arg == '--cycles-resumable-num-chunks':
-            num_resumable_chunks = int(argv[idx + 1])
-        elif arg == '--cycles-resumable-current-chunk':
-            current_resumable_chunk = int(argv[idx + 1])
-        idx += 1
+    parser = _configure_argument_parser()
+    args, unknown = parser.parse_known_args(argv[argv.index("--") + 1:])
 
-    if num_resumable_chunks is not None and current_resumable_chunk is not None:
-        import _cycles
-        _cycles.set_resumable_chunks(num_resumable_chunks,
-                                     current_resumable_chunk)
+    if args.cycles_resumable_num_chunks is not None:
+        if args.cycles_resumable_current_chunk is not None:
+            import _cycles
+            _cycles.set_resumable_chunk(
+                    int(args.cycles_resumable_num_chunks),
+                    int(args.cycles_resumable_current_chunk))
+        elif args.cycles_resumable_start_chunk is not None and \
+             args.cycles_resumable_end_chunk:
+            import _cycles
+            _cycles.set_resumable_chunk_range(
+                    int(args.cycles_resumable_num_chunks),
+                    int(args.cycles_resumable_start_chunk),
+                    int(args.cycles_resumable_end_chunk))
 
 
 def init():
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index f97b51b629d..82c4ffc6e50 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -82,12 +82,23 @@ class AddPresetSampling(AddPresetBase, Operator):
     preset_subdir = "cycles/sampling"
 
 
+classes = (
+    AddPresetIntegrator,
+    AddPresetSampling,
+)
+
+
 def register():
-    pass
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
 
 
 def unregister():
-    pass
+    from bpy.utils import unregister_class
+    for cls in classes:
+        unregister_class(cls)
+
 
 if __name__ == "__main__":
     register()
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 5c51f9afc28..cbf469b3a89 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -665,8 +665,10 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
         cls.debug_use_cpu_sse3 = BoolProperty(name="SSE3", default=True)
         cls.debug_use_cpu_sse2 = BoolProperty(name="SSE2", default=True)
         cls.debug_use_qbvh = BoolProperty(name="QBVH", default=True)
+        cls.debug_use_cpu_split_kernel = BoolProperty(name="Split Kernel", default=False)
 
         cls.debug_use_cuda_adaptive_compile = BoolProperty(name="Adaptive Compile", default=False)
+        cls.debug_use_cuda_split_kernel = BoolProperty(name="Split Kernel", default=False)
 
         cls.debug_opencl_kernel_type = EnumProperty(
             name="OpenCL Kernel Type",
@@ -693,6 +695,8 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
             update=devices_update_callback
             )
 
+        cls.debug_opencl_kernel_single_program = BoolProperty(name="Single Program", default=False, update=devices_update_callback);
+
         cls.debug_use_opencl_debug = BoolProperty(name="Debug OpenCL", default=False)
 
     @classmethod
@@ -1092,6 +1096,12 @@ class CyclesObjectSettings(bpy.types.PropertyGroup):
                 default=1.0,
                 )
 
+        cls.is_shadow_catcher = BoolProperty(
+                name="Shadow Catcher",
+                description="Only render shadows on this object, for compositing renders into real footage",
+                default=False,
+                )
+
     @classmethod
     def unregister(cls):
         del bpy.types.Object.cycles
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 44af5f7efed..2b50d272be8 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -86,12 +86,10 @@ def use_sample_all_lights(context):
 
     return cscene.sample_all_lights_direct or cscene.sample_all_lights_indirect
 
-def show_device_selection(context):
-    type = get_device_type(context)
-    if type == 'NETWORK':
+def show_device_active(context):
+    cscene = context.scene.cycles
+    if cscene.device != 'GPU':
         return True
-    if not type in {'CUDA', 'OPENCL'}:
-        return False
     return context.user_preferences.addons[__package__].preferences.has_active_device()
 
 
@@ -186,9 +184,6 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.label(text="AA Samples:")
             sub.prop(cscene, "aa_samples", text="Render")
             sub.prop(cscene, "preview_aa_samples", text="Preview")
-            sub.separator()
-            sub.prop(cscene, "sample_all_lights_direct")
-            sub.prop(cscene, "sample_all_lights_indirect")
 
             col = split.column()
             sub = col.column(align=True)
@@ -205,6 +200,10 @@ class CyclesRender_PT_sampling(CyclesButtonsPanel, Panel):
             sub.prop(cscene, "subsurface_samples", text="Subsurface")
             sub.prop(cscene, "volume_samples", text="Volume")
 
+            col = layout.column(align=True)
+            col.prop(cscene, "sample_all_lights_direct")
+            col.prop(cscene, "sample_all_lights_indirect")
+
         if not (use_opencl(context) and cscene.feature_set != 'EXPERIMENTAL'):
             layout.row().prop(cscene, "sampling_pattern", text="Pattern")
 
@@ -270,7 +269,7 @@ class CyclesRender_PT_geometry(CyclesButtonsPanel, Panel):
 
         row = col.row()
         row.prop(ccscene, "minimum_width", text="Min Pixels")
-        row.prop(ccscene, "maximum_width", text="Max Ext.")
+        row.prop(ccscene, "maximum_width", text="Max Extension")
 
 
 class CyclesRender_PT_light_paths(CyclesButtonsPanel, Panel):
@@ -788,6 +787,8 @@ class CyclesObject_PT_cycles_settings(CyclesButtonsPanel, Panel):
         if ob.type != 'LAMP':
             flow.prop(visibility, "shadow")
 
+        layout.prop(cob, "is_shadow_catcher")
+
         col = layout.column()
         col.label(text="Performance:")
         row = col.row()
@@ -1518,15 +1519,18 @@ class CyclesRender_PT_debug(CyclesButtonsPanel, Panel):
         row.prop(cscene, "debug_use_cpu_avx", toggle=True)
         row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
         col.prop(cscene, "debug_use_qbvh")
+        col.prop(cscene, "debug_use_cpu_split_kernel")
 
         col = layout.column()
         col.label('CUDA Flags:')
         col.prop(cscene, "debug_use_cuda_adaptive_compile")
+        col.prop(cscene, "debug_use_cuda_split_kernel")
 
         col = layout.column()
         col.label('OpenCL Flags:')
         col.prop(cscene, "debug_opencl_kernel_type", text="Kernel")
         col.prop(cscene, "debug_opencl_device_type", text="Device")
+        col.prop(cscene, "debug_opencl_kernel_single_program", text="Single Program")
         col.prop(cscene, "debug_use_opencl_debug", text="Debug")
 
 
@@ -1633,7 +1637,7 @@ def draw_device(self, context):
         split = layout.split(percentage=1/3)
         split.label("Device:")
         row = split.row()
-        row.active = show_device_selection(context)
+        row.active = show_device_active(context)
         row.prop(cscene, "device", text="")
 
         if engine.with_osl() and use_cpu(context):
@@ -1712,17 +1716,75 @@ def get_panels():
 
     return panels
 
+
+classes = (
+    CYCLES_MT_sampling_presets,
+    CYCLES_MT_integrator_presets,
+    CyclesRender_PT_sampling,
+    CyclesRender_PT_geometry,
+    CyclesRender_PT_light_paths,
+    CyclesRender_PT_motion_blur,
+    CyclesRender_PT_film,
+    CyclesRender_PT_performance,
+    CyclesRender_PT_layer_options,
+    CyclesRender_PT_layer_passes,
+    CyclesRender_PT_views,
+    Cycles_PT_post_processing,
+    CyclesCamera_PT_dof,
+    Cycles_PT_context_material,
+    CyclesObject_PT_motion_blur,
+    CyclesObject_PT_cycles_settings,
+    CYCLES_OT_use_shading_nodes,
+    CyclesLamp_PT_preview,
+    CyclesLamp_PT_lamp,
+    CyclesLamp_PT_nodes,
+    CyclesLamp_PT_spot,
+    CyclesWorld_PT_preview,
+    CyclesWorld_PT_surface,
+    CyclesWorld_PT_volume,
+    CyclesWorld_PT_ambient_occlusion,
+    CyclesWorld_PT_mist,
+    CyclesWorld_PT_ray_visibility,
+    CyclesWorld_PT_settings,
+    CyclesMaterial_PT_preview,
+    CyclesMaterial_PT_surface,
+    CyclesMaterial_PT_volume,
+    CyclesMaterial_PT_displacement,
+    CyclesMaterial_PT_settings,
+    CyclesTexture_PT_context,
+    CyclesTexture_PT_node,
+    CyclesTexture_PT_mapping,
+    CyclesTexture_PT_colors,
+    CyclesParticle_PT_textures,
+    CyclesRender_PT_bake,
+    CyclesRender_PT_debug,
+    CyclesParticle_PT_CurveSettings,
+    CyclesScene_PT_simplify,
+)
+
+
 def register():
+    from bpy.utils import register_class
+
     bpy.types.RENDER_PT_render.append(draw_device)
     bpy.types.VIEW3D_HT_header.append(draw_pause)
 
     for panel in get_panels():
         panel.COMPAT_ENGINES.add('CYCLES')
 
+    for cls in classes:
+        register_class(cls)
+
+
 def unregister():
+    from bpy.utils import unregister_class
+
     bpy.types.RENDER_PT_render.remove(draw_device)
     bpy.types.VIEW3D_HT_header.remove(draw_pause)
 
     for panel in get_panels():
         if 'CYCLES' in panel.COMPAT_ENGINES:
             panel.COMPAT_ENGINES.remove('CYCLES')
+
+    for cls in classes:
+        unregister_class(cls)
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index f02fc553908..40d6b25f2b7 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "scene.h"
+#include "render/camera.h"
+#include "render/scene.h"
 
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
-#include "util_logging.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp
index e42ff5d72a6..6fa038e8bf0 100644
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -14,18 +14,18 @@
  * limitations under the License.
  */
 
-#include "attribute.h"
-#include "camera.h"
-#include "curves.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
+#include "render/attribute.h"
+#include "render/camera.h"
+#include "render/curves.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
 
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -411,6 +411,7 @@ static void ExportCurveTrianglePlanes(Mesh *mesh, ParticleCurveData *CData,
 		}
 	}
 
+	mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -434,8 +435,8 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 			if(CData->curve_keynum[curve] <= 1 || CData->curve_length[curve] == 0.0f)
 				continue;
 
-			numverts += (CData->curve_keynum[curve] - 2)*2*resolution + resolution;
-			numtris += (CData->curve_keynum[curve] - 2)*resolution;
+			numverts += (CData->curve_keynum[curve] - 1)*resolution + resolution;
+			numtris += (CData->curve_keynum[curve] - 1)*2*resolution;
 		}
 	}
 
@@ -545,6 +546,7 @@ static void ExportCurveTriangleGeometry(Mesh *mesh,
 		}
 	}
 
+	mesh->resize_mesh(mesh->verts.size(), mesh->triangles.size());
 	mesh->attributes.remove(ATTR_STD_VERTEX_NORMAL);
 	mesh->attributes.remove(ATTR_STD_FACE_NORMAL);
 	mesh->add_face_normals();
@@ -890,7 +892,7 @@ void BlenderSync::sync_curves(Mesh *mesh,
 	}
 
 	/* obtain general settings */
-	bool use_curves = scene->curve_system_manager->use_curves;
+	const bool use_curves = scene->curve_system_manager->use_curves;
 
 	if(!(use_curves && b_ob.mode() != b_ob.mode_PARTICLE_EDIT)) {
 		if(!motion)
@@ -898,11 +900,11 @@ void BlenderSync::sync_curves(Mesh *mesh,
 		return;
 	}
 
-	int primitive = scene->curve_system_manager->primitive;
-	int triangle_method = scene->curve_system_manager->triangle_method;
-	int resolution = scene->curve_system_manager->resolution;
-	size_t vert_num = mesh->verts.size();
-	size_t tri_num = mesh->num_triangles();
+	const int primitive = scene->curve_system_manager->primitive;
+	const int triangle_method = scene->curve_system_manager->triangle_method;
+	const int resolution = scene->curve_system_manager->resolution;
+	const size_t vert_num = mesh->verts.size();
+	const size_t tri_num = mesh->num_triangles();
 	int used_res = 1;
 
 	/* extract particle hair data - should be combined with connecting to mesh later*/
diff --git a/intern/cycles/blender/blender_logging.cpp b/intern/cycles/blender/blender_logging.cpp
index f4f86929168..d0f82e37662 100644
--- a/intern/cycles/blender/blender_logging.cpp
+++ b/intern/cycles/blender/blender_logging.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "CCL_api.h"
-#include "util_logging.h"
+#include "blender/CCL_api.h"
+#include "util/util_logging.h"
 
 void CCL_init_logging(const char *argv0)
 {
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 85117cfff7b..e0e89cec65c 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -15,21 +15,22 @@
  */
 
  
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "camera.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/camera.h"
 
-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
 
-#include "subd_patch.h"
-#include "subd_split.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_math.h"
+#include "util/util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
 
 #include "mikktspace.h"
 
@@ -525,69 +526,177 @@ static void attr_create_uv_map(Scene *scene,
 }
 
 /* Create vertex pointiness attributes. */
+
+/* Compare vertices by sum of their coordinates; equal sums fall back to
+ * descending index, so exact doubles keep a fixed order and the comparison
+ * stays a valid strict weak ordering for sort(). */
+class VertexAverageComparator {
+public:
+	VertexAverageComparator(const array<float3>& verts) : verts_(verts) {}
+
+	bool operator()(const int& vert_idx_a, const int& vert_idx_b)
+	{
+		const float3 &vert_a = verts_[vert_idx_a];
+		const float3 &vert_b = verts_[vert_idx_b];
+		const float x1 = vert_a.x + vert_a.y + vert_a.z;
+		const float x2 = vert_b.x + vert_b.y + vert_b.z;
+		if(x1 == x2) {
+			/* Covers doubles too: identical vertices have identical sums. */
+			return vert_idx_a > vert_idx_b;
+		}
+		return x1 < x2;
+	}
+
+protected:
+	const array<float3>& verts_;
+};
+
 static void attr_create_pointiness(Scene *scene,
                                    Mesh *mesh,
                                    BL::Mesh& b_mesh,
                                    bool subdivision)
 {
-	if(mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
-		const int numverts = b_mesh.vertices.length();
-		AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
-		Attribute *attr = attributes.add(ATTR_STD_POINTINESS);
-		float *data = attr->data_float();
-		int *counter = new int[numverts];
-		float *raw_data = new float[numverts];
-		float3 *edge_accum = new float3[numverts];
-
-		/* Calculate pointiness using single ring neighborhood. */
-		memset(counter, 0, sizeof(int) * numverts);
-		memset(raw_data, 0, sizeof(float) * numverts);
-		memset(edge_accum, 0, sizeof(float3) * numverts);
-		BL::Mesh::edges_iterator e;
-		int i = 0;
-		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
-			int v0 = b_mesh.edges[i].vertices()[0],
-			    v1 = b_mesh.edges[i].vertices()[1];
-			float3 co0 = get_float3(b_mesh.vertices[v0].co()),
-			       co1 = get_float3(b_mesh.vertices[v1].co());
-			float3 edge = normalize(co1 - co0);
-			edge_accum[v0] += edge;
-			edge_accum[v1] += -edge;
-			++counter[v0];
-			++counter[v1];
-		}
-		i = 0;
-		BL::Mesh::vertices_iterator v;
-		for(b_mesh.vertices.begin(v); v != b_mesh.vertices.end(); ++v, ++i) {
-			if(counter[i] > 0) {
-				float3 normal = get_float3(b_mesh.vertices[i].normal());
-				float angle = safe_acosf(dot(normal, edge_accum[i] / counter[i]));
-				raw_data[i] = angle * M_1_PI_F;
+	if(!mesh->need_attribute(scene, ATTR_STD_POINTINESS)) {
+		return;
+	}
+	const int num_verts = b_mesh.vertices.length();
+	/* STEP 1: Find out duplicated vertices and point duplicates to a single
+	 *         original vertex.
+	 */
+	vector<int> sorted_vert_indeices(num_verts);
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		sorted_vert_indeices[vert_index] = vert_index;
+	}
+	VertexAverageComparator compare(mesh->verts);
+	sort(sorted_vert_indeices.begin(), sorted_vert_indeices.end(), compare);
+	/* This array stores index of the original vertex for the given vertex
+	 * index.
+	 */
+	vector<int> vert_orig_index(num_verts);
+	for(int sorted_vert_index = 0;
+	    sorted_vert_index < num_verts;
+	    ++sorted_vert_index)
+	{
+		const int vert_index = sorted_vert_indeices[sorted_vert_index];
+		const float3 &vert_co = mesh->verts[vert_index];
+		bool found = false;
+		for(int other_sorted_vert_index = sorted_vert_index + 1;
+		    other_sorted_vert_index < num_verts;
+		    ++other_sorted_vert_index)
+		{
+			const int other_vert_index =
+			        sorted_vert_indeices[other_sorted_vert_index];
+			const float3 &other_vert_co = mesh->verts[other_vert_index];
+			/* We are too far away now, there can't be a duplicate. */
+			if((other_vert_co.x + other_vert_co.y + other_vert_co.z) -
+			   (vert_co.x + vert_co.y + vert_co.z) > 3 * FLT_EPSILON)
+			{
+				break;
 			}
-			else {
-				raw_data[i] = 0.0f;
+			/* Found duplicate. */
+			if(len_squared(other_vert_co - vert_co) < FLT_EPSILON) {
+				found = true;
+				vert_orig_index[vert_index] = other_vert_index;
+				break;
 			}
 		}
-
-		/* Blur vertices to approximate 2 ring neighborhood. */
-		memset(counter, 0, sizeof(int) * numverts);
-		memcpy(data, raw_data, sizeof(float) * numverts);
-		i = 0;
-		for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++i) {
-			int v0 = b_mesh.edges[i].vertices()[0],
-			    v1 = b_mesh.edges[i].vertices()[1];
-			data[v0] += raw_data[v1];
-			data[v1] += raw_data[v0];
-			++counter[v0];
-			++counter[v1];
+		if(!found) {
+			vert_orig_index[vert_index] = vert_index;
+		}
+	}
+	/* Make sure we always point to the very first original vertex. */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		int orig_index = vert_orig_index[vert_index];
+		while(orig_index != vert_orig_index[orig_index]) {
+			orig_index = vert_orig_index[orig_index];
 		}
-		for(i = 0; i < numverts; ++i) {
-			data[i] /= counter[i] + 1;
+		vert_orig_index[vert_index] = orig_index;
+	}
+	sorted_vert_indeices.free_memory();
+	/* STEP 2: Calculate vertex normals taking into account their possible
+	 *         duplicates which get "welded" together.
+	 */
+	vector<float3> vert_normal(num_verts, make_float3(0.0f, 0.0f, 0.0f));
+	/* First we accumulate all vertex normals in the original index. */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const float3 normal = get_float3(b_mesh.vertices[vert_index].normal());
+		const int orig_index = vert_orig_index[vert_index];
+		vert_normal[orig_index] += normal;
+	}
+	/* Then we normalize the accumulated result and flush it to all duplicates
+	 * as well.
+	 */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const int orig_index = vert_orig_index[vert_index];
+		vert_normal[vert_index] = normalize(vert_normal[orig_index]);
+	}
+	/* STEP 3: Calculate pointiness using single ring neighborhood. */
+	vector<int> counter(num_verts, 0);
+	vector<float> raw_data(num_verts, 0.0f);
+	vector<float3> edge_accum(num_verts, make_float3(0.0f, 0.0f, 0.0f));
+	BL::Mesh::edges_iterator e;
+	EdgeMap visited_edges;
+	int edge_index = 0;
+	memset(&counter[0], 0, sizeof(int) * counter.size());
+	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) {
+		const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]],
+		          v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]];
+		if(visited_edges.exists(v0, v1)) {
+			continue;
+		}
+		visited_edges.insert(v0, v1);
+		float3 co0 = get_float3(b_mesh.vertices[v0].co()),
+		       co1 = get_float3(b_mesh.vertices[v1].co());
+		float3 edge = normalize(co1 - co0);
+		edge_accum[v0] += edge;
+		edge_accum[v1] += -edge;
+		++counter[v0];
+		++counter[v1];
+	}
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const int orig_index = vert_orig_index[vert_index];
+		if(orig_index != vert_index) {
+			/* Skip duplicates, they'll be overwritten later on. */
+			continue;
+		}
+		if(counter[vert_index] > 0) {
+			const float3 normal = vert_normal[vert_index];
+			const float angle =
+			        safe_acosf(dot(normal,
+			                       edge_accum[vert_index] / counter[vert_index]));
+			raw_data[vert_index] = angle * M_1_PI_F;
+		}
+		else {
+			raw_data[vert_index] = 0.0f;
 		}
-
-		delete [] counter;
-		delete [] raw_data;
-		delete [] edge_accum;
+	}
+	/* STEP 4: Blur vertices to approximate 2 ring neighborhood. */
+	AttributeSet& attributes = (subdivision)? mesh->subd_attributes: mesh->attributes;
+	Attribute *attr = attributes.add(ATTR_STD_POINTINESS);
+	float *data = attr->data_float();
+	memcpy(data, &raw_data[0], sizeof(float) * raw_data.size());
+	memset(&counter[0], 0, sizeof(int) * counter.size());
+	edge_index = 0;
+	visited_edges.clear();
+	for(b_mesh.edges.begin(e); e != b_mesh.edges.end(); ++e, ++edge_index) {
+		const int v0 = vert_orig_index[b_mesh.edges[edge_index].vertices()[0]],
+		          v1 = vert_orig_index[b_mesh.edges[edge_index].vertices()[1]];
+		if(visited_edges.exists(v0, v1)) {
+			continue;
+		}
+		visited_edges.insert(v0, v1);
+		data[v0] += raw_data[v1];
+		data[v1] += raw_data[v0];
+		++counter[v0];
+		++counter[v1];
+	}
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		data[vert_index] /= counter[vert_index] + 1;
+	}
+	/* STEP 5: Copy attribute to the duplicated vertices. */
+	for(int vert_index = 0; vert_index < num_verts; ++vert_index) {
+		const int orig_index = vert_orig_index[vert_index];
+		data[vert_index] = data[orig_index];
 	}
 }
 
@@ -656,9 +765,6 @@ static void create_mesh(Scene *scene,
 			generated[i++] = get_float3(v->undeformed_co())*size - loc;
 	}
 
-	/* Create needed vertex attributes. */
-	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
-
 	/* create faces */
 	vector<int> nverts(numfaces);
 	vector<int> face_flags(numfaces, FACE_FLAG_NONE);
@@ -671,6 +777,15 @@ static void create_mesh(Scene *scene,
 			int shader = clamp(f->material_index(), 0, used_shaders.size()-1);
 			bool smooth = f->use_smooth() || use_loop_normals;
 
+			if(use_loop_normals) {
+				BL::Array<float, 12> loop_normals = f->split_normals();
+				for(int i = 0; i < n; i++) {
+					N[vi[i]] = make_float3(loop_normals[i * 3],
+					                       loop_normals[i * 3 + 1],
+					                       loop_normals[i * 3 + 2]);
+				}
+			}
+
 			/* Create triangles.
 			 *
 			 * NOTE: Autosmooth is already taken care about.
@@ -704,7 +819,7 @@ static void create_mesh(Scene *scene,
 			int shader = clamp(p->material_index(), 0, used_shaders.size()-1);
 			bool smooth = p->use_smooth() || use_loop_normals;
 
-			vi.reserve(n);
+			vi.resize(n);
 			for(int i = 0; i < n; i++) {
 				/* NOTE: Autosmooth is already taken care about. */
 				vi[i] = b_mesh.loops[p->loop_start() + i].vertex_index();
@@ -718,6 +833,7 @@ static void create_mesh(Scene *scene,
 	/* Create all needed attributes.
 	 * The calculate functions will check whether they're needed or not.
 	 */
+	attr_create_pointiness(scene, mesh, b_mesh, subdivision);
 	attr_create_vertex_color(scene, mesh, b_mesh, nverts, face_flags, subdivision);
 	attr_create_uv_map(scene, mesh, b_mesh, nverts, face_flags, subdivision, subdivide_uvs);
 
@@ -1178,4 +1294,3 @@ void BlenderSync::sync_mesh_motion(BL::Object& b_ob,
 }
 
 CCL_NAMESPACE_END
-
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 637cf7abda8..d05699236cc 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -14,24 +14,24 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "integrator.h"
-#include "graph.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "nodes.h"
-#include "particles.h"
-#include "shader.h"
-
-#include "blender_object_cull.h"
-#include "blender_sync.h"
-#include "blender_util.h"
-
-#include "util_foreach.h"
-#include "util_hash.h"
-#include "util_logging.h"
+#include "render/camera.h"
+#include "render/integrator.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/nodes.h"
+#include "render/particles.h"
+#include "render/shader.h"
+
+#include "blender/blender_object_cull.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -343,6 +343,13 @@ Object *BlenderSync::sync_object(BL::Object& b_parent,
 		object_updated = true;
 	}
 
+	PointerRNA cobject = RNA_pointer_get(&b_ob.ptr, "cycles");
+	bool is_shadow_catcher = get_boolean(cobject, "is_shadow_catcher");
+	if(is_shadow_catcher != object->is_shadow_catcher) {
+		object->is_shadow_catcher = is_shadow_catcher;
+		object_updated = true;
+	}
+
 	/* object sync
 	 * transform comparison should not be needed, but duplis don't work perfect
 	 * in the depsgraph and may not signal changes, so this is a workaround */
diff --git a/intern/cycles/blender/blender_object_cull.cpp b/intern/cycles/blender/blender_object_cull.cpp
index 08918dd1a49..0333c027f70 100644
--- a/intern/cycles/blender/blender_object_cull.cpp
+++ b/intern/cycles/blender/blender_object_cull.cpp
@@ -16,9 +16,9 @@
 
 #include <cstdlib>
 
-#include "camera.h"
+#include "render/camera.h"
 
-#include "blender_object_cull.h"
+#include "blender/blender_object_cull.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_object_cull.h b/intern/cycles/blender/blender_object_cull.h
index b6f0ca5cd31..2147877a860 100644
--- a/intern/cycles/blender/blender_object_cull.h
+++ b/intern/cycles/blender/blender_object_cull.h
@@ -17,8 +17,8 @@
 #ifndef __BLENDER_OBJECT_CULL_H__
 #define __BLENDER_OBJECT_CULL_H__
 
-#include "blender_sync.h"
-#include "util_types.h"
+#include "blender/blender_sync.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_particles.cpp b/intern/cycles/blender/blender_particles.cpp
index dd2900a8d5b..00f8cb3cf1b 100644
--- a/intern/cycles/blender/blender_particles.cpp
+++ b/intern/cycles/blender/blender_particles.cpp
@@ -14,14 +14,14 @@
  * limitations under the License.
  */
 
-#include "mesh.h"
-#include "object.h"
-#include "particles.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/particles.h"
 
-#include "blender_sync.h"
-#include "blender_util.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
 
-#include "util_foreach.h"
+#include "util/util_foreach.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 438abc49f88..d509e9de981 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -16,21 +16,21 @@
 
 #include <Python.h>
 
-#include "CCL_api.h"
+#include "blender/CCL_api.h"
 
-#include "blender_sync.h"
-#include "blender_session.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_types.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
 
 #ifdef WITH_OSL
-#include "osl.h"
+#include "render/osl.h"
 
 #include <OSL/oslquery.h>
 #include <OSL/oslconfig.h>
@@ -67,8 +67,10 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
 	flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
 	flags.cpu.qbvh = get_boolean(cscene, "debug_use_qbvh");
+	flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
 	/* Synchronize CUDA flags. */
 	flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
+	flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
 	/* Synchronize OpenCL kernel type. */
 	switch(get_enum(cscene, "debug_opencl_kernel_type")) {
 		case 0:
@@ -104,6 +106,7 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
 	}
 	/* Synchronize other OpenCL flags. */
 	flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
+	flags.opencl.single_program = get_boolean(cscene, "debug_opencl_kernel_single_program");
 	return flags.opencl.device_type != opencl_device_type ||
 	       flags.opencl.kernel_type != opencl_kernel_type;
 }
@@ -641,7 +644,7 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/
 	Py_RETURN_NONE;
 }
 
-static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
+static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args)
 {
 	int num_resumable_chunks, current_resumable_chunk;
 	if(!PyArg_ParseTuple(args, "ii",
@@ -676,6 +679,53 @@ static PyObject *set_resumable_chunks_func(PyObject * /*self*/, PyObject *args)
 	Py_RETURN_NONE;
 }
 
+/* Python entry point: select an inclusive range of resumable chunks to render.
+ * Expects three ints (total chunk count, first chunk, last chunk) and stores
+ * them in the BlenderSession statics. Out-of-range values abort the process. */
+static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args)
+{
+	int num_chunks, start_chunk, end_chunk;
+	if(!PyArg_ParseTuple(args, "iii", &num_chunks, &start_chunk, &end_chunk)) {
+		/* Parsing already raised a Python exception; NULL lets it propagate. */
+		return NULL;
+	}
+
+	/* Chunk numbers are 1-based; start_chunk == end_chunk is accepted. */
+	if(num_chunks <= 0) {
+		fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(start_chunk < 1 || start_chunk > num_chunks) {
+		fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(end_chunk < 1 || end_chunk > num_chunks) {
+		fprintf(stderr, "Cycles: Bad value for end chunk number.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+	if(start_chunk > end_chunk) {
+		fprintf(stderr, "Cycles: End chunk should be higher than start one.\n");
+		abort();
+		Py_RETURN_NONE;
+	}
+
+	VLOG(1) << "Initialized resumable render: "
+	        << "num_resumable_chunks=" << num_chunks << ", "
+	        << "start_resumable_chunk=" << start_chunk << ", "
+	        << "end_resumable_chunk=" << end_chunk;
+	BlenderSession::num_resumable_chunks = num_chunks;
+	BlenderSession::start_resumable_chunk = start_chunk;
+	BlenderSession::end_resumable_chunk = end_chunk;
+
+	printf("Cycles: Will render chunks %d to %d of %d\n",
+	       start_chunk, end_chunk, num_chunks);
+
+	Py_RETURN_NONE;
+}
+
 static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
 {
 	vector<DeviceInfo>& devices = Device::available_devices();
@@ -715,7 +765,8 @@ static PyMethodDef methods[] = {
 	{"debug_flags_reset", debug_flags_reset_func, METH_NOARGS, ""},
 
 	/* Resumable render */
-	{"set_resumable_chunks", set_resumable_chunks_func, METH_VARARGS, ""},
+	{"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""},
+	{"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""},
 
 	/* Compute Device selection */
 	{"get_device_types", get_device_types_func, METH_VARARGS, ""},
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 2f30cbd961f..26f9bccd95d 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -16,36 +16,38 @@
 
 #include <stdlib.h>
 
-#include "background.h"
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "session.h"
-#include "shader.h"
-
-#include "util_color.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_hash.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_time.h"
-
-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
+#include "render/background.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/shader.h"
+
+#include "util/util_color.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_hash.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_time.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
 
 CCL_NAMESPACE_BEGIN
 
 bool BlenderSession::headless = false;
 int BlenderSession::num_resumable_chunks = 0;
 int BlenderSession::current_resumable_chunk = 0;
+int BlenderSession::start_resumable_chunk = 0;
+int BlenderSession::end_resumable_chunk = 0;
 
 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
                                BL::UserPreferences& b_userpref,
@@ -68,6 +70,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = true;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
+	last_status_time = 0.0;
 }
 
 BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
@@ -93,6 +96,7 @@ BlenderSession::BlenderSession(BL::RenderEngine& b_engine,
 	background = false;
 	last_redraw_time = 0.0;
 	start_resize_time = 0.0;
+	last_status_time = 0.0;
 }
 
 BlenderSession::~BlenderSession()
@@ -989,10 +993,14 @@ void BlenderSession::update_status_progress()
 	if(substatus.size() > 0)
 		status += " | " + substatus;
 
-	if(status != last_status) {
+	double current_time = time_dt();
+	/* When rendering in a window, redraw the status at least once per second to keep the elapsed and remaining time up-to-date.
+	 * For headless rendering, only report when something significant changes to keep the console output readable. */
+	if(status != last_status || (!headless && (current_time - last_status_time) > 1.0)) {
 		b_engine.update_stats("", (timestatus + scene + status).c_str());
 		b_engine.update_memory_stats(mem_used, mem_peak);
 		last_status = status;
+		last_status_time = current_time;
 	}
 	if(progress != last_progress) {
 		b_engine.update_progress(progress);
@@ -1342,9 +1350,21 @@ void BlenderSession::update_resumable_tile_manager(int num_samples)
 		return;
 	}
 
-	int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
-	int range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
-	int range_num_samples = num_samples_per_chunk;
+	const int num_samples_per_chunk = (int)ceilf((float)num_samples / num_resumable_chunks);
+
+	int range_start_sample, range_num_samples;
+	if(current_resumable_chunk != 0) {
+		/* Single chunk rendering. */
+		range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
+		range_num_samples = num_samples_per_chunk;
+	}
+	else {
+		/* Ranged-chunks. */
+		const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1;
+		range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1);
+		range_num_samples = num_chunks * num_samples_per_chunk;
+	}
+	/* Make sure we don't overshoot: keep only the samples left after the range start. */
 	if(range_start_sample + range_num_samples > num_samples) {
-		range_num_samples = num_samples - range_num_samples;
+		range_num_samples = num_samples - range_start_sample;
 	}
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index 82fe218b4ce..22b21a18f2e 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -17,12 +17,12 @@
 #ifndef __BLENDER_SESSION_H__
 #define __BLENDER_SESSION_H__
 
-#include "device.h"
-#include "scene.h"
-#include "session.h"
-#include "bake.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/bake.h"
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -113,6 +113,7 @@ public:
 	string last_status;
 	string last_error;
 	float last_progress;
+	double last_status_time;
 
 	int width, height;
 	double start_resize_time;
@@ -137,6 +138,10 @@ public:
 	/* Current resumable chunk index to render. */
 	static int current_resumable_chunk;
 
+	/* Alternative to single-chunk rendering to render a range of chunks. */
+	static int start_resumable_chunk;
+	static int end_resumable_chunk;
+
 protected:
 	void do_write_update_render_result(BL::RenderResult& b_rr,
 	                                   BL::RenderLayer& b_rlay,
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index c9d4236a7f2..1ec82445b20 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -14,20 +14,21 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "graph.h"
-#include "light.h"
-#include "nodes.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
-
-#include "blender_texture.h"
-#include "blender_sync.h"
-#include "blender_util.h"
-
-#include "util_debug.h"
-#include "util_string.h"
+#include "render/background.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/nodes.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"
+
+#include "blender/blender_texture.h"
+#include "blender/blender_sync.h"
+#include "blender/blender_util.h"
+
+#include "util/util_debug.h"
+#include "util/util_string.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -622,7 +623,8 @@ static ShaderNode *add_node(Scene *scene,
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
 			                  b_image.source() == BL::Image::source_MOVIE ||
-			                  b_engine.is_preview();
+			                  (b_engine.is_preview() &&
+			                   b_image.source() != BL::Image::source_SEQUENCE);
 
 			if(is_builtin) {
 				/* for builtin images we're using image datablock name to find an image to
@@ -675,7 +677,8 @@ static ShaderNode *add_node(Scene *scene,
 			bool is_builtin = b_image.packed_file() ||
 			                  b_image.source() == BL::Image::source_GENERATED ||
 			                  b_image.source() == BL::Image::source_MOVIE ||
-			                  b_engine.is_preview();
+			                  (b_engine.is_preview() &&
+			                   b_image.source() != BL::Image::source_SEQUENCE);
 
 			if(is_builtin) {
 				int scene_frame = b_scene.frame_current();
@@ -1168,6 +1171,13 @@ static void add_nodes(Scene *scene,
 
 /* Sync Materials */
 
+void BlenderSync::sync_materials_simpligy(Shader *shader)
+{
+	/* Simplify the shader's graph first, then tag the shader for update. */
+	shader->graph->simplify(scene);
+	shader->tag_update(scene);
+}
+
 void BlenderSync::sync_materials(bool update_all)
 {
 	shader_map.set_default(scene->default_surface);
@@ -1175,6 +1185,8 @@ void BlenderSync::sync_materials(bool update_all)
 	/* material loop */
 	BL::BlendData::materials_iterator b_mat;
 
+	TaskPool pool;
+
 	for(b_data.materials.begin(b_mat); b_mat != b_data.materials.end(); ++b_mat) {
 		Shader *shader;
 
@@ -1210,9 +1222,31 @@ void BlenderSync::sync_materials(bool update_all)
 			shader->displacement_method = (experimental) ? get_displacement_method(cmat) : DISPLACE_BUMP;
 
 			shader->set_graph(graph);
-			shader->tag_update(scene);
+
+			/* By simplifying the shader graph as soon as possible, some
+			 * redundant shader nodes might be removed which prevents loading
+			 * unnecessary attributes later.
+			 *
+			 * However, since graph simplification also accounts for e.g. mix
+			 * weight, this would cause frequent expensive resyncs in interactive
+			 * sessions, so for those sessions optimization is only performed
+			 * right before compiling.
+			 */
+			if(!preview) {
+				pool.push(function_bind(&BlenderSync::sync_materials_simpligy,
+				                        this,
+				                        shader));
+			}
+			else {
+				/* NOTE: Update tagging can access links which are being
+				 * optimized out.
+				 */
+				shader->tag_update(scene);
+			}
 		}
 	}
+
+	pool.wait_work();
 }
 
 /* Sync World */
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index d8043105cd8..3b071bf0e7d 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -14,29 +14,29 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "camera.h"
-#include "film.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
-#include "curves.h"
-
-#include "device.h"
-
-#include "blender_sync.h"
-#include "blender_session.h"
-#include "blender_util.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_opengl.h"
-#include "util_hash.h"
+#include "render/background.h"
+#include "render/camera.h"
+#include "render/film.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/curves.h"
+
+#include "device/device.h"
+
+#include "blender/blender_sync.h"
+#include "blender/blender_session.h"
+#include "blender/blender_util.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_opengl.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index 6984cbda259..62e2f8f563a 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -22,15 +22,15 @@
 #include "RNA_access.h"
 #include "RNA_blender_cpp.h"
 
-#include "blender_util.h"
+#include "blender/blender_util.h"
 
-#include "scene.h"
-#include "session.h"
+#include "render/scene.h"
+#include "render/session.h"
 
-#include "util_map.h"
-#include "util_set.h"
-#include "util_transform.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -96,6 +96,7 @@ public:
 private:
 	/* sync */
 	void sync_lamps(bool update_all);
+	void sync_materials_simpligy(Shader *shader);
 	void sync_materials(bool update_all);
 	void sync_objects(BL::SpaceView3D& b_v3d, float motion_time = 0.0f);
 	void sync_motion(BL::RenderSettings& b_render,
diff --git a/intern/cycles/blender/blender_texture.cpp b/intern/cycles/blender/blender_texture.cpp
index 3807e683c7c..b2e27b76189 100644
--- a/intern/cycles/blender/blender_texture.cpp
+++ b/intern/cycles/blender/blender_texture.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "blender_texture.h"
+#include "blender/blender_texture.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_texture.h b/intern/cycles/blender/blender_texture.h
index ad96f9db8ed..734231a85ec 100644
--- a/intern/cycles/blender/blender_texture.h
+++ b/intern/cycles/blender/blender_texture.h
@@ -18,7 +18,7 @@
 #define __BLENDER_TEXTURE_H__
 
 #include <stdlib.h>
-#include "blender_sync.h"
+#include "blender/blender_sync.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/blender/blender_util.h b/intern/cycles/blender/blender_util.h
index b67834cdea3..4d575330520 100644
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -17,14 +17,15 @@
 #ifndef __BLENDER_UTIL_H__
 #define __BLENDER_UTIL_H__
 
-#include "mesh.h"
+#include "render/mesh.h"
 
-#include "util_map.h"
-#include "util_path.h"
-#include "util_set.h"
-#include "util_transform.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_algorithm.h"
+#include "util/util_map.h"
+#include "util/util_path.h"
+#include "util/util_set.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 /* Hacks to hook into Blender API
  * todo: clean this up ... */
@@ -78,7 +79,7 @@ static inline BL::Mesh object_to_mesh(BL::BlendData& data,
 				me.calc_normals_split();
 			}
 			else {
-				me.split_faces();
+				me.split_faces(false);
 			}
 		}
 		if(subdivision_type == Mesh::SUBDIVISION_NONE) {
@@ -786,6 +787,35 @@ struct ParticleSystemKey {
 	}
 };
 
+/* Set of undirected edges, each identified by its pair of vertex indices. */
+class EdgeMap {
+public:
+	EdgeMap() {}
+
+	/* Forget every stored edge. */
+	void clear() { edges_.clear(); }
+
+	/* Store the edge (v0, v1); the order of the two vertices is irrelevant. */
+	void insert(int v0, int v1) {
+		get_sorted_verts(v0, v1);
+		edges_.insert(std::make_pair(v0, v1));
+	}
+
+	/* Check whether the edge was inserted before, in either vertex order. */
+	bool exists(int v0, int v1) {
+		get_sorted_verts(v0, v1);
+		return edges_.end() != edges_.find(std::make_pair(v0, v1));
+	}
+
+protected:
+	/* Order the indices so that v0 <= v1, giving each edge a single key. */
+	void get_sorted_verts(int& v0, int& v1) {
+		if(v1 < v0) swap(v0, v1);
+	}
+
+	set< std::pair<int, int> > edges_;
+};
+
 CCL_NAMESPACE_END
 
 #endif /* __BLENDER_UTIL_H__ */
diff --git a/intern/cycles/bvh/CMakeLists.txt b/intern/cycles/bvh/CMakeLists.txt
index 92e48f0d87f..4701d75350a 100644
--- a/intern/cycles/bvh/CMakeLists.txt
+++ b/intern/cycles/bvh/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../util
-	../device
+	..
 )
 
 set(INC_SYS
diff --git a/intern/cycles/bvh/bvh.cpp b/intern/cycles/bvh/bvh.cpp
index 874a4246d1d..58348d16746 100644
--- a/intern/cycles/bvh/bvh.cpp
+++ b/intern/cycles/bvh/bvh.cpp
@@ -15,25 +15,25 @@
  * limitations under the License.
  */
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "curves.h"
-
-#include "bvh.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
-#include "bvh_params.h"
-#include "bvh_unaligned.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_math.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/curves.h"
+
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_params.h"
+#include "bvh/bvh_unaligned.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_math.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -67,7 +67,7 @@ BVH *BVH::create(const BVHParams& params, const vector<Object*>& objects)
 	if(params.use_qbvh)
 		return new QBVH(params, objects);
 	else
-		return new RegularBVH(params, objects);
+		return new BinaryBVH(params, objects);
 }
 
 /* Building */
@@ -81,6 +81,7 @@ void BVH::build(Progress& progress)
 	                   pack.prim_type,
 	                   pack.prim_index,
 	                   pack.prim_object,
+	                   pack.prim_time,
 	                   params,
 	                   progress);
 	BVHNode *root = bvh_build.run();
@@ -256,6 +257,10 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	pack.leaf_nodes.resize(leaf_nodes_size);
 	pack.object_node.resize(objects.size());
 
+	if(params.num_motion_curve_steps > 0 || params.num_motion_triangle_steps > 0) {
+		pack.prim_time.resize(prim_index_size);
+	}
+
 	int *pack_prim_index = (pack.prim_index.size())? &pack.prim_index[0]: NULL;
 	int *pack_prim_type = (pack.prim_type.size())? &pack.prim_type[0]: NULL;
 	int *pack_prim_object = (pack.prim_object.size())? &pack.prim_object[0]: NULL;
@@ -264,6 +269,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 	uint *pack_prim_tri_index = (pack.prim_tri_index.size())? &pack.prim_tri_index[0]: NULL;
 	int4 *pack_nodes = (pack.nodes.size())? &pack.nodes[0]: NULL;
 	int4 *pack_leaf_nodes = (pack.leaf_nodes.size())? &pack.leaf_nodes[0]: NULL;
+	float2 *pack_prim_time = (pack.prim_time.size())? &pack.prim_time[0]: NULL;
 
 	/* merge */
 	foreach(Object *ob, objects) {
@@ -309,6 +315,7 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 			int *bvh_prim_type = &bvh->pack.prim_type[0];
 			uint *bvh_prim_visibility = &bvh->pack.prim_visibility[0];
 			uint *bvh_prim_tri_index = &bvh->pack.prim_tri_index[0];
+			float2 *bvh_prim_time = bvh->pack.prim_time.size()? &bvh->pack.prim_time[0]: NULL;
 
 			for(size_t i = 0; i < bvh_prim_index_size; i++) {
 				if(bvh->pack.prim_type[i] & PRIMITIVE_ALL_CURVE) {
@@ -324,6 +331,9 @@ void BVH::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
 				pack_prim_type[pack_prim_index_offset] = bvh_prim_type[i];
 				pack_prim_visibility[pack_prim_index_offset] = bvh_prim_visibility[i];
 				pack_prim_object[pack_prim_index_offset] = 0;  // unused for instances
+				if(bvh_prim_time != NULL) {
+					pack_prim_time[pack_prim_index_offset] = bvh_prim_time[i];
+				}
 				pack_prim_index_offset++;
 			}
 		}
@@ -414,64 +424,64 @@ static bool node_bvh_is_unaligned(const BVHNode *node)
 {
 	const BVHNode *node0 = node->get_child(0),
 	              *node1 = node->get_child(1);
-	return node0->is_unaligned() || node1->is_unaligned();
+	return node0->is_unaligned || node1->is_unaligned;
 }
 
-RegularBVH::RegularBVH(const BVHParams& params_, const vector<Object*>& objects_)
+BinaryBVH::BinaryBVH(const BVHParams& params_, const vector<Object*>& objects_)
 : BVH(params_, objects_)
 {
 }
 
-void RegularBVH::pack_leaf(const BVHStackEntry& e,
-                           const LeafNode *leaf)
+void BinaryBVH::pack_leaf(const BVHStackEntry& e,
+                          const LeafNode *leaf)
 {
 	assert(e.idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
 	float4 data[BVH_NODE_LEAF_SIZE];
 	memset(data, 0, sizeof(data));
-	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
+	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
 		/* object */
-		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].x = __int_as_float(~(leaf->lo));
 		data[0].y = __int_as_float(0);
 	}
 	else {
 		/* triangle */
-		data[0].x = __int_as_float(leaf->m_lo);
-		data[0].y = __int_as_float(leaf->m_hi);
+		data[0].x = __int_as_float(leaf->lo);
+		data[0].y = __int_as_float(leaf->hi);
 	}
-	data[0].z = __uint_as_float(leaf->m_visibility);
+	data[0].z = __uint_as_float(leaf->visibility);
 	if(leaf->num_triangles() != 0) {
-		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
+		data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
 	}
 
 	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_NODE_LEAF_SIZE);
 }
 
-void RegularBVH::pack_inner(const BVHStackEntry& e,
-                            const BVHStackEntry& e0,
-                            const BVHStackEntry& e1)
+void BinaryBVH::pack_inner(const BVHStackEntry& e,
+                           const BVHStackEntry& e0,
+                           const BVHStackEntry& e1)
 {
-	if(e0.node->is_unaligned() || e1.node->is_unaligned()) {
+	if(e0.node->is_unaligned || e1.node->is_unaligned) {
 		pack_unaligned_inner(e, e0, e1);
 	} else {
 		pack_aligned_inner(e, e0, e1);
 	}
 }
 
-void RegularBVH::pack_aligned_inner(const BVHStackEntry& e,
-                                    const BVHStackEntry& e0,
-                                    const BVHStackEntry& e1)
+void BinaryBVH::pack_aligned_inner(const BVHStackEntry& e,
+                                   const BVHStackEntry& e0,
+                                   const BVHStackEntry& e1)
 {
 	pack_aligned_node(e.idx,
-	                  e0.node->m_bounds, e1.node->m_bounds,
+	                  e0.node->bounds, e1.node->bounds,
 	                  e0.encodeIdx(), e1.encodeIdx(),
-	                  e0.node->m_visibility, e1.node->m_visibility);
+	                  e0.node->visibility, e1.node->visibility);
 }
 
-void RegularBVH::pack_aligned_node(int idx,
-                                   const BoundBox& b0,
-                                   const BoundBox& b1,
-                                   int c0, int c1,
-                                   uint visibility0, uint visibility1)
+void BinaryBVH::pack_aligned_node(int idx,
+                                  const BoundBox& b0,
+                                  const BoundBox& b1,
+                                  int c0, int c1,
+                                  uint visibility0, uint visibility1)
 {
 	assert(idx + BVH_NODE_SIZE <= pack.nodes.size());
 	assert(c0 < 0 || c0 < pack.nodes.size());
@@ -498,26 +508,26 @@ void RegularBVH::pack_aligned_node(int idx,
 	memcpy(&pack.nodes[idx], data, sizeof(int4)*BVH_NODE_SIZE);
 }
 
-void RegularBVH::pack_unaligned_inner(const BVHStackEntry& e,
-                                      const BVHStackEntry& e0,
-                                      const BVHStackEntry& e1)
+void BinaryBVH::pack_unaligned_inner(const BVHStackEntry& e,
+                                     const BVHStackEntry& e0,
+                                     const BVHStackEntry& e1)
 {
 	pack_unaligned_node(e.idx,
 	                    e0.node->get_aligned_space(),
 	                    e1.node->get_aligned_space(),
-	                    e0.node->m_bounds,
-	                    e1.node->m_bounds,
+	                    e0.node->bounds,
+	                    e1.node->bounds,
 	                    e0.encodeIdx(), e1.encodeIdx(),
-	                    e0.node->m_visibility, e1.node->m_visibility);
+	                    e0.node->visibility, e1.node->visibility);
 }
 
-void RegularBVH::pack_unaligned_node(int idx,
-                                     const Transform& aligned_space0,
-                                     const Transform& aligned_space1,
-                                     const BoundBox& bounds0,
-                                     const BoundBox& bounds1,
-                                     int c0, int c1,
-                                     uint visibility0, uint visibility1)
+void BinaryBVH::pack_unaligned_node(int idx,
+                                    const Transform& aligned_space0,
+                                    const Transform& aligned_space1,
+                                    const BoundBox& bounds0,
+                                    const BoundBox& bounds1,
+                                    int c0, int c1,
+                                    uint visibility0, uint visibility1)
 {
 	assert(idx + BVH_UNALIGNED_NODE_SIZE <= pack.nodes.size());
 	assert(c0 < 0 || c0 < pack.nodes.size());
@@ -543,7 +553,7 @@ void RegularBVH::pack_unaligned_node(int idx,
 	memcpy(&pack.nodes[idx], data, sizeof(float4)*BVH_UNALIGNED_NODE_SIZE);
 }
 
-void RegularBVH::pack_nodes(const BVHNode *root)
+void BinaryBVH::pack_nodes(const BVHNode *root)
 {
 	const size_t num_nodes = root->getSubtreeSize(BVH_STAT_NODE_COUNT);
 	const size_t num_leaf_nodes = root->getSubtreeSize(BVH_STAT_LEAF_COUNT);
@@ -620,7 +630,7 @@ void RegularBVH::pack_nodes(const BVHNode *root)
 	pack.root_index = (root->is_leaf())? -1: 0;
 }
 
-void RegularBVH::refit_nodes()
+void BinaryBVH::refit_nodes()
 {
 	assert(!params.top_level);
 
@@ -629,7 +639,7 @@ void RegularBVH::refit_nodes()
 	refit_node(0, (pack.root_index == -1)? true: false, bbox, visibility);
 }
 
-void RegularBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
+void BinaryBVH::refit_node(int idx, bool leaf, BoundBox& bbox, uint& visibility)
 {
 	if(leaf) {
 		assert(idx + BVH_NODE_LEAF_SIZE <= pack.leaf_nodes.size());
@@ -759,18 +769,18 @@ static bool node_qbvh_is_unaligned(const BVHNode *node)
 	              *node1 = node->get_child(1);
 	bool has_unaligned = false;
 	if(node0->is_leaf()) {
-		has_unaligned |= node0->is_unaligned();
+		has_unaligned |= node0->is_unaligned;
 	}
 	else {
-		has_unaligned |= node0->get_child(0)->is_unaligned();
-		has_unaligned |= node0->get_child(1)->is_unaligned();
+		has_unaligned |= node0->get_child(0)->is_unaligned;
+		has_unaligned |= node0->get_child(1)->is_unaligned;
 	}
 	if(node1->is_leaf()) {
-		has_unaligned |= node1->is_unaligned();
+		has_unaligned |= node1->is_unaligned;
 	}
 	else {
-		has_unaligned |= node1->get_child(0)->is_unaligned();
-		has_unaligned |= node1->get_child(1)->is_unaligned();
+		has_unaligned |= node1->get_child(0)->is_unaligned;
+		has_unaligned |= node1->get_child(1)->is_unaligned;
 	}
 	return has_unaligned;
 }
@@ -785,19 +795,19 @@ void QBVH::pack_leaf(const BVHStackEntry& e, const LeafNode *leaf)
 {
 	float4 data[BVH_QNODE_LEAF_SIZE];
 	memset(data, 0, sizeof(data));
-	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->m_lo] == -1) {
+	if(leaf->num_triangles() == 1 && pack.prim_index[leaf->lo] == -1) {
 		/* object */
-		data[0].x = __int_as_float(~(leaf->m_lo));
+		data[0].x = __int_as_float(~(leaf->lo));
 		data[0].y = __int_as_float(0);
 	}
 	else {
 		/* triangle */
-		data[0].x = __int_as_float(leaf->m_lo);
-		data[0].y = __int_as_float(leaf->m_hi);
+		data[0].x = __int_as_float(leaf->lo);
+		data[0].y = __int_as_float(leaf->hi);
 	}
-	data[0].z = __uint_as_float(leaf->m_visibility);
+	data[0].z = __uint_as_float(leaf->visibility);
 	if(leaf->num_triangles() != 0) {
-		data[0].w = __uint_as_float(pack.prim_type[leaf->m_lo]);
+		data[0].w = __uint_as_float(pack.prim_type[leaf->lo]);
 	}
 
 	memcpy(&pack.leaf_nodes[e.idx], data, sizeof(float4)*BVH_QNODE_LEAF_SIZE);
@@ -813,7 +823,7 @@ void QBVH::pack_inner(const BVHStackEntry& e,
 	 */
 	if(params.use_unaligned_nodes) {
 		for(int i = 0; i < num; i++) {
-			if(en[i].node->is_unaligned()) {
+			if(en[i].node->is_unaligned) {
 				has_unaligned = true;
 				break;
 			}
@@ -838,15 +848,15 @@ void QBVH::pack_aligned_inner(const BVHStackEntry& e,
 	BoundBox bounds[4];
 	int child[4];
 	for(int i = 0; i < num; ++i) {
-		bounds[i] = en[i].node->m_bounds;
+		bounds[i] = en[i].node->bounds;
 		child[i] = en[i].encodeIdx();
 	}
 	pack_aligned_node(e.idx,
 	                  bounds,
 	                  child,
-	                  e.node->m_visibility,
-	                  e.node->m_time_from,
-	                  e.node->m_time_to,
+	                  e.node->visibility,
+	                  e.node->time_from,
+	                  e.node->time_to,
 	                  num);
 }
 
@@ -907,16 +917,16 @@ void QBVH::pack_unaligned_inner(const BVHStackEntry& e,
 	int child[4];
 	for(int i = 0; i < num; ++i) {
 		aligned_space[i] = en[i].node->get_aligned_space();
-		bounds[i] = en[i].node->m_bounds;
+		bounds[i] = en[i].node->bounds;
 		child[i] = en[i].encodeIdx();
 	}
 	pack_unaligned_node(e.idx,
 	                    aligned_space,
 	                    bounds,
 	                    child,
-	                    e.node->m_visibility,
-	                    e.node->m_time_from,
-	                    e.node->m_time_to,
+	                    e.node->visibility,
+	                    e.node->time_from,
+	                    e.node->time_to,
 	                    num);
 }
 
diff --git a/intern/cycles/bvh/bvh.h b/intern/cycles/bvh/bvh.h
index 35f4d305883..60bc62ee6e4 100644
--- a/intern/cycles/bvh/bvh.h
+++ b/intern/cycles/bvh/bvh.h
@@ -18,10 +18,10 @@
 #ifndef __BVH_H__
 #define __BVH_H__
 
-#include "bvh_params.h"
+#include "bvh/bvh_params.h"
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -68,6 +68,8 @@ struct PackedBVH {
 	array<int> prim_index;
 	/* mapping from BVH primitive index, to the object id of that primitive. */
 	array<int> prim_object;
+	/* Time range of BVH primitive. */
+	array<float2> prim_time;
 
 	/* index of the root node. */
 	int root_index;
@@ -108,15 +110,15 @@ protected:
 	virtual void refit_nodes() = 0;
 };
 
-/* Regular BVH
+/* Binary BVH
  *
  * Typical BVH with each node having two children. */
 
-class RegularBVH : public BVH {
+class BinaryBVH : public BVH {
 protected:
 	/* constructor */
 	friend class BVH;
-	RegularBVH(const BVHParams& params, const vector<Object*>& objects);
+	BinaryBVH(const BVHParams& params, const vector<Object*>& objects);
 
 	/* pack */
 	void pack_nodes(const BVHNode *root);
diff --git a/intern/cycles/bvh/bvh_binning.cpp b/intern/cycles/bvh/bvh_binning.cpp
index 5ddd7349f7b..3226008f511 100644
--- a/intern/cycles/bvh/bvh_binning.cpp
+++ b/intern/cycles/bvh/bvh_binning.cpp
@@ -19,11 +19,11 @@
 
 #include <stdlib.h>
 
-#include "bvh_binning.h"
+#include "bvh/bvh_binning.h"
 
-#include "util_algorithm.h"
-#include "util_boundbox.h"
-#include "util_types.h"
+#include "util/util_algorithm.h"
+#include "util/util_boundbox.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_binning.h b/intern/cycles/bvh/bvh_binning.h
index 52955f70151..285f9c56a62 100644
--- a/intern/cycles/bvh/bvh_binning.h
+++ b/intern/cycles/bvh/bvh_binning.h
@@ -18,10 +18,10 @@
 #ifndef __BVH_BINNING_H__
 #define __BVH_BINNING_H__
 
-#include "bvh_params.h"
-#include "bvh_unaligned.h"
+#include "bvh/bvh_params.h"
+#include "bvh/bvh_unaligned.h"
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index a2f8b33cb0b..95c71b54da0 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -15,26 +15,26 @@
  * limitations under the License.
  */
 
-#include "bvh_binning.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
-#include "bvh_params.h"
+#include "bvh/bvh_binning.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"
+#include "bvh/bvh_params.h"
 #include "bvh_split.h"
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "curves.h"
-
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_stack_allocator.h"
-#include "util_simd.h"
-#include "util_time.h"
-#include "util_queue.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/curves.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_stack_allocator.h"
+#include "util/util_simd.h"
+#include "util/util_time.h"
+#include "util/util_queue.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -93,12 +93,14 @@ BVHBuild::BVHBuild(const vector<Object*>& objects_,
                    array<int>& prim_type_,
                    array<int>& prim_index_,
                    array<int>& prim_object_,
+                   array<float2>& prim_time_,
                    const BVHParams& params_,
                    Progress& progress_)
  : objects(objects_),
    prim_type(prim_type_),
    prim_index(prim_index_),
    prim_object(prim_object_),
+   prim_time(prim_time_),
    params(params_),
    progress(progress_),
    progress_start_time(0.0),
@@ -465,6 +467,9 @@ BVHNode* BVHBuild::run()
 	}
 	spatial_free_index = 0;
 
+	need_prim_time = params.num_motion_curve_steps > 0 ||
+	                 params.num_motion_triangle_steps > 0;
+
 	/* init progress updates */
 	double build_start_time;
 	build_start_time = progress_start_time = time_dt();
@@ -475,6 +480,12 @@ BVHNode* BVHBuild::run()
 	prim_type.resize(references.size());
 	prim_index.resize(references.size());
 	prim_object.resize(references.size());
+	if(need_prim_time) {
+		prim_time.resize(references.size());
+	}
+	else {
+		prim_time.resize(0);
+	}
 
 	/* build recursively */
 	BVHNode *rootnode;
@@ -849,11 +860,14 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		prim_type[start] = ref->prim_type();
 		prim_index[start] = ref->prim_index();
 		prim_object[start] = ref->prim_object();
+		if(need_prim_time) {
+			prim_time[start] = make_float2(ref->time_from(), ref->time_to());
+		}
 
 		uint visibility = objects[ref->prim_object()]->visibility;
 		BVHNode *leaf_node =  new LeafNode(ref->bounds(), visibility, start, start+1);
-		leaf_node->m_time_from = ref->time_from();
-		leaf_node->m_time_to = ref->time_to();
+		leaf_node->time_from = ref->time_from();
+		leaf_node->time_to = ref->time_to();
 		return leaf_node;
 	}
 	else {
@@ -862,12 +876,12 @@ BVHNode *BVHBuild::create_object_leaf_nodes(const BVHReference *ref, int start,
 		BVHNode *leaf1 = create_object_leaf_nodes(ref+mid, start+mid, num-mid);
 
 		BoundBox bounds = BoundBox::empty;
-		bounds.grow(leaf0->m_bounds);
-		bounds.grow(leaf1->m_bounds);
+		bounds.grow(leaf0->bounds);
+		bounds.grow(leaf1->bounds);
 
 		BVHNode *inner_node = new InnerNode(bounds, leaf0, leaf1);
-		inner_node->m_time_from = min(leaf0->m_time_from, leaf1->m_time_from);
-		inner_node->m_time_to = max(leaf0->m_time_to, leaf1->m_time_to);
+		inner_node->time_from = min(leaf0->time_from, leaf1->time_from);
+		inner_node->time_to = max(leaf0->time_to, leaf1->time_to);
 		return inner_node;
 	}
 }
@@ -891,11 +905,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	 *    can not control.
 	 */
 	typedef StackAllocator<256, int> LeafStackAllocator;
+	typedef StackAllocator<256, float2> LeafTimeStackAllocator;
 	typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;
 
 	vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
 	vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
+	vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
 	vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL];
 
 	/* TODO(sergey): In theory we should be able to store references. */
@@ -918,6 +934,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			p_type[type_index].push_back(ref.prim_type());
 			p_index[type_index].push_back(ref.prim_index());
 			p_object[type_index].push_back(ref.prim_object());
+			p_time[type_index].push_back(make_float2(ref.time_from(),
+			                                         ref.time_to()));
 
 			bounds[type_index].grow(ref.bounds());
 			visibility[type_index] |= objects[ref.prim_object()]->visibility;
@@ -947,9 +965,13 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	vector<int, LeafStackAllocator> local_prim_type,
 	                                local_prim_index,
 	                                local_prim_object;
+	vector<float2, LeafTimeStackAllocator> local_prim_time;
 	local_prim_type.resize(num_new_prims);
 	local_prim_index.resize(num_new_prims);
 	local_prim_object.resize(num_new_prims);
+	if(need_prim_time) {
+		local_prim_time.resize(num_new_prims);
+	}
 	for(int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
 		int num = (int)p_type[i].size();
 		if(num != 0) {
@@ -962,6 +984,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				local_prim_type[index] = p_type[i][j];
 				local_prim_index[index] = p_index[i][j];
 				local_prim_object[index] = p_object[i][j];
+				if(need_prim_time) {
+					local_prim_time[index] = p_time[i][j];
+				}
 				if(params.use_unaligned_nodes && !alignment_found) {
 					alignment_found =
 						unaligned_heuristic.compute_aligned_space(p_ref[i][j],
@@ -979,19 +1004,19 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 					time_from = min(time_from, ref.time_from());
 					time_to = max(time_to, ref.time_to());
 				}
-				leaf_node->m_time_from = time_from;
-				leaf_node->m_time_to = time_to;
+				leaf_node->time_from = time_from;
+				leaf_node->time_to = time_to;
 			}
 			if(alignment_found) {
 				/* Need to recalculate leaf bounds with new alignment. */
-				leaf_node->m_bounds = BoundBox::empty;
+				leaf_node->bounds = BoundBox::empty;
 				for(int j = 0; j < num; ++j) {
 					const BVHReference &ref = p_ref[i][j];
 					BoundBox ref_bounds =
 					        unaligned_heuristic.compute_aligned_prim_boundbox(
 					                ref,
 					                aligned_space);
-					leaf_node->m_bounds.grow(ref_bounds);
+					leaf_node->bounds.grow(ref_bounds);
 				}
 				/* Set alignment space. */
 				leaf_node->set_aligned_space(aligned_space);
@@ -1028,11 +1053,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 				prim_type.reserve(reserve);
 				prim_index.reserve(reserve);
 				prim_object.reserve(reserve);
+				if(need_prim_time) {
+					prim_time.reserve(reserve);
+				}
 			}
 
 			prim_type.resize(range_end);
 			prim_index.resize(range_end);
 			prim_object.resize(range_end);
+			if(need_prim_time) {
+				prim_time.resize(range_end);
+			}
 		}
 		spatial_spin_lock.unlock();
 
@@ -1041,6 +1072,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
 			memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
 			memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
+			if(need_prim_time) {
+				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
+			}
 		}
 	}
 	else {
@@ -1053,6 +1087,9 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 			memcpy(&prim_type[start_index], &local_prim_type[0], new_leaf_data_size);
 			memcpy(&prim_index[start_index], &local_prim_index[0], new_leaf_data_size);
 			memcpy(&prim_object[start_index], &local_prim_object[0], new_leaf_data_size);
+			if(need_prim_time) {
+				memcpy(&prim_time[start_index], &local_prim_time[0], sizeof(float2)*num_new_leaf_data);
+			}
 		}
 	}
 
@@ -1062,8 +1099,8 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 	 */
 	for(int i = 0; i < num_leaves; ++i) {
 		LeafNode *leaf = (LeafNode *)leaves[i];
-		leaf->m_lo += start_index;
-		leaf->m_hi += start_index;
+		leaf->lo += start_index;
+		leaf->hi += start_index;
 	}
 
 	/* Create leaf node for object. */
@@ -1092,17 +1129,17 @@ BVHNode* BVHBuild::create_leaf_node(const BVHRange& range,
 		return new InnerNode(range.bounds(), leaves[0], leaves[1]);
 	}
 	else if(num_leaves == 3) {
-		BoundBox inner_bounds = merge(leaves[1]->m_bounds, leaves[2]->m_bounds);
+		BoundBox inner_bounds = merge(leaves[1]->bounds, leaves[2]->bounds);
 		BVHNode *inner = new InnerNode(inner_bounds, leaves[1], leaves[2]);
 		return new InnerNode(range.bounds(), leaves[0], inner);
 	} else {
 		/* Should be doing more branches if more primitive types added. */
 		assert(num_leaves <= 5);
-		BoundBox inner_bounds_a = merge(leaves[0]->m_bounds, leaves[1]->m_bounds);
-		BoundBox inner_bounds_b = merge(leaves[2]->m_bounds, leaves[3]->m_bounds);
+		BoundBox inner_bounds_a = merge(leaves[0]->bounds, leaves[1]->bounds);
+		BoundBox inner_bounds_b = merge(leaves[2]->bounds, leaves[3]->bounds);
 		BVHNode *inner_a = new InnerNode(inner_bounds_a, leaves[0], leaves[1]);
 		BVHNode *inner_b = new InnerNode(inner_bounds_b, leaves[2], leaves[3]);
-		BoundBox inner_bounds_c = merge(inner_a->m_bounds, inner_b->m_bounds);
+		BoundBox inner_bounds_c = merge(inner_a->bounds, inner_b->bounds);
 		BVHNode *inner_c = new InnerNode(inner_bounds_c, inner_a, inner_b);
 		if(num_leaves == 5) {
 			return new InnerNode(range.bounds(), inner_c, leaves[4]);
@@ -1137,8 +1174,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 		rotate(parent->children[c], max_depth-1);
 
 	/* compute current area of all children */
-	BoundBox bounds0 = parent->children[0]->m_bounds;
-	BoundBox bounds1 = parent->children[1]->m_bounds;
+	BoundBox bounds0 = parent->children[0]->bounds;
+	BoundBox bounds1 = parent->children[1]->bounds;
 
 	float area0 = bounds0.half_area();
 	float area1 = bounds1.half_area();
@@ -1158,8 +1195,8 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 		BoundBox& other = (c == 0)? bounds1: bounds0;
 
 		/* transpose child bounds */
-		BoundBox target0 = child->children[0]->m_bounds;
-		BoundBox target1 = child->children[1]->m_bounds;
+		BoundBox target0 = child->children[0]->bounds;
+		BoundBox target1 = child->children[1]->bounds;
 
 		/* compute cost for both possible swaps */
 		float cost0 = merge(other, target1).half_area() - child_area[c];
@@ -1191,7 +1228,7 @@ void BVHBuild::rotate(BVHNode *node, int max_depth)
 	InnerNode *child = (InnerNode*)parent->children[best_child];
 
 	swap(parent->children[best_other], child->children[best_target]);
-	child->m_bounds = merge(child->children[0]->m_bounds, child->children[1]->m_bounds);
+	child->bounds = merge(child->children[0]->bounds, child->children[1]->bounds);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_build.h b/intern/cycles/bvh/bvh_build.h
index ee3cde66a2f..5733708050d 100644
--- a/intern/cycles/bvh/bvh_build.h
+++ b/intern/cycles/bvh/bvh_build.h
@@ -20,13 +20,13 @@
 
 #include <float.h>
 
-#include "bvh.h"
-#include "bvh_binning.h"
-#include "bvh_unaligned.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_binning.h"
+#include "bvh/bvh_unaligned.h"
 
-#include "util_boundbox.h"
-#include "util_task.h"
-#include "util_vector.h"
+#include "util/util_boundbox.h"
+#include "util/util_task.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,6 +48,7 @@ public:
 	         array<int>& prim_type,
 	         array<int>& prim_index,
 	         array<int>& prim_object,
+	         array<float2>& prim_time,
 	         const BVHParams& params,
 	         Progress& progress);
 	~BVHBuild();
@@ -112,6 +113,9 @@ protected:
 	array<int>& prim_type;
 	array<int>& prim_index;
 	array<int>& prim_object;
+	array<float2>& prim_time;
+
+	bool need_prim_time;
 
 	/* Build parameters. */
 	BVHParams params;
diff --git a/intern/cycles/bvh/bvh_node.cpp b/intern/cycles/bvh/bvh_node.cpp
index 67580e1bc7b..4f788c66797 100644
--- a/intern/cycles/bvh/bvh_node.cpp
+++ b/intern/cycles/bvh/bvh_node.cpp
@@ -15,12 +15,12 @@
  * limitations under the License.
  */
 
-#include "bvh.h"
-#include "bvh_build.h"
-#include "bvh_node.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_node.h"
 
-#include "util_debug.h"
-#include "util_vector.h"
+#include "util/util_debug.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -62,12 +62,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			}
 			return cnt;
 		case BVH_STAT_ALIGNED_COUNT:
-			if(!is_unaligned()) {
+			if(!is_unaligned) {
 				cnt = 1;
 			}
 			break;
 		case BVH_STAT_UNALIGNED_COUNT:
-			if(is_unaligned()) {
+			if(is_unaligned) {
 				cnt = 1;
 			}
 			break;
@@ -75,7 +75,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			if(!is_leaf()) {
 				bool has_unaligned = false;
 				for(int j = 0; j < num_children(); j++) {
-					has_unaligned |= get_child(j)->is_unaligned();
+					has_unaligned |= get_child(j)->is_unaligned;
 				}
 				cnt += has_unaligned? 0: 1;
 			}
@@ -84,7 +84,7 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			if(!is_leaf()) {
 				bool has_unaligned = false;
 				for(int j = 0; j < num_children(); j++) {
-					has_unaligned |= get_child(j)->is_unaligned();
+					has_unaligned |= get_child(j)->is_unaligned;
 				}
 				cnt += has_unaligned? 1: 0;
 			}
@@ -95,12 +95,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 				for(int i = 0; i < num_children(); i++) {
 					BVHNode *node = get_child(i);
 					if(node->is_leaf()) {
-						has_unaligned |= node->is_unaligned();
+						has_unaligned |= node->is_unaligned;
 					}
 					else {
 						for(int j = 0; j < node->num_children(); j++) {
 							cnt += node->get_child(j)->getSubtreeSize(stat);
-							has_unaligned |= node->get_child(j)->is_unaligned();
+							has_unaligned |= node->get_child(j)->is_unaligned;
 						}
 					}
 				}
@@ -113,12 +113,12 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 				for(int i = 0; i < num_children(); i++) {
 					BVHNode *node = get_child(i);
 					if(node->is_leaf()) {
-						has_unaligned |= node->is_unaligned();
+						has_unaligned |= node->is_unaligned;
 					}
 					else {
 						for(int j = 0; j < node->num_children(); j++) {
 							cnt += node->get_child(j)->getSubtreeSize(stat);
-							has_unaligned |= node->get_child(j)->is_unaligned();
+							has_unaligned |= node->get_child(j)->is_unaligned;
 						}
 					}
 				}
@@ -126,10 +126,10 @@ int BVHNode::getSubtreeSize(BVH_STAT stat) const
 			}
 			return cnt;
 		case BVH_STAT_ALIGNED_LEAF_COUNT:
-			cnt = (is_leaf() && !is_unaligned()) ? 1 : 0;
+			cnt = (is_leaf() && !is_unaligned) ? 1 : 0;
 			break;
 		case BVH_STAT_UNALIGNED_LEAF_COUNT:
-			cnt = (is_leaf() && is_unaligned()) ? 1 : 0;
+			cnt = (is_leaf() && is_unaligned) ? 1 : 0;
 			break;
 		default:
 			assert(0); /* unknown mode */
@@ -157,7 +157,7 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons
 
 	for(int i = 0; i < num_children(); i++) {
 		BVHNode *child = get_child(i);
-		SAH += child->computeSubtreeSAHCost(p, probability * child->m_bounds.safe_area()/m_bounds.safe_area());
+		SAH += child->computeSubtreeSAHCost(p, probability * child->bounds.safe_area()/bounds.safe_area());
 	}
 
 	return SAH;
@@ -165,15 +165,15 @@ float BVHNode::computeSubtreeSAHCost(const BVHParams& p, float probability) cons
 
 uint BVHNode::update_visibility()
 {
-	if(!is_leaf() && m_visibility == 0) {
+	if(!is_leaf() && visibility == 0) {
 		InnerNode *inner = (InnerNode*)this;
 		BVHNode *child0 = inner->children[0];
 		BVHNode *child1 = inner->children[1];
 
-		m_visibility = child0->update_visibility()|child1->update_visibility();
+		visibility = child0->update_visibility()|child1->update_visibility();
 	}
 
-	return m_visibility;
+	return visibility;
 }
 
 void BVHNode::update_time()
@@ -184,8 +184,8 @@ void BVHNode::update_time()
 		BVHNode *child1 = inner->children[1];
 		child0->update_time();
 		child1->update_time();
-		m_time_from = min(child0->m_time_from, child1->m_time_from);
-		m_time_to =  max(child0->m_time_to, child1->m_time_to);
+		time_from = min(child0->time_from, child1->time_from);
+		time_to =  max(child0->time_to, child1->time_to);
 	}
 }
 
@@ -209,7 +209,7 @@ void LeafNode::print(int depth) const
 	for(int i = 0; i < depth; i++)
 		printf("  ");
 	
-	printf("leaf node %d to %d\n", m_lo, m_hi);
+	printf("leaf node %d to %d\n", lo, hi);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/bvh/bvh_node.h b/intern/cycles/bvh/bvh_node.h
index 090c426de56..60511b4b012 100644
--- a/intern/cycles/bvh/bvh_node.h
+++ b/intern/cycles/bvh/bvh_node.h
@@ -18,9 +18,9 @@
 #ifndef __BVH_NODE_H__
 #define __BVH_NODE_H__
 
-#include "util_boundbox.h"
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -46,16 +46,16 @@ class BVHParams;
 class BVHNode
 {
 public:
-	BVHNode() : m_is_unaligned(false),
-	            m_aligned_space(NULL),
-	            m_time_from(0.0f),
-	            m_time_to(1.0f)
+	BVHNode() : is_unaligned(false),
+	            aligned_space(NULL),
+	            time_from(0.0f),
+	            time_to(1.0f)
 	{
 	}
 
 	virtual ~BVHNode()
 	{
-		delete m_aligned_space;
+		delete aligned_space;
 	}
 
 	virtual bool is_leaf() const = 0;
@@ -63,30 +63,26 @@ public:
 	virtual BVHNode *get_child(int i) const = 0;
 	virtual int num_triangles() const { return 0; }
 	virtual void print(int depth = 0) const = 0;
-	bool is_unaligned() const { return m_is_unaligned; }
 
 	inline void set_aligned_space(const Transform& aligned_space)
 	{
-		m_is_unaligned = true;
-		if(m_aligned_space == NULL) {
-			m_aligned_space = new Transform(aligned_space);
+		is_unaligned = true;
+		if(this->aligned_space == NULL) {
+			this->aligned_space = new Transform(aligned_space);
 		}
 		else {
-			*m_aligned_space = aligned_space;
+			*this->aligned_space = aligned_space;
 		}
 	}
 
 	inline Transform get_aligned_space() const
 	{
-		if(m_aligned_space == NULL) {
+		if(aligned_space == NULL) {
 			return transform_identity();
 		}
-		return *m_aligned_space;
+		return *aligned_space;
 	}
 
-	BoundBox m_bounds;
-	uint m_visibility;
-
 	// Subtree functions
 	int getSubtreeSize(BVH_STAT stat=BVH_STAT_NODE_COUNT) const;
 	float computeSubtreeSAHCost(const BVHParams& p, float probability = 1.0f) const;
@@ -95,13 +91,18 @@ public:
 	uint update_visibility();
 	void update_time();
 
-	bool m_is_unaligned;
+	// Properties.
+	BoundBox bounds;
+	uint visibility;
+
+	bool is_unaligned;
 
-	// TODO(sergey): Can be stored as 3x3 matrix, but better to have some
-	// utilities and type defines in util_transform first.
-	Transform *m_aligned_space;
+	/* TODO(sergey): Can be stored as 3x3 matrix, but better to have some
+	 * utilities and type defines in util_transform first.
+	 */
+	Transform *aligned_space;
 
-	float m_time_from, m_time_to;
+	float time_from, time_to;
 };
 
 class InnerNode : public BVHNode
@@ -111,20 +112,20 @@ public:
 	          BVHNode* child0,
 	          BVHNode* child1)
 	{
-		m_bounds = bounds;
+		this->bounds = bounds;
 		children[0] = child0;
 		children[1] = child1;
 
 		if(child0 && child1)
-			m_visibility = child0->m_visibility|child1->m_visibility;
+			visibility = child0->visibility|child1->visibility;
 		else
-			m_visibility = 0; /* happens on build cancel */
+			visibility = 0; /* happens on build cancel */
 	}
 
 	explicit InnerNode(const BoundBox& bounds)
 	{
-		m_bounds = bounds;
-		m_visibility = 0;
+		this->bounds = bounds;
+		visibility = 0;
 		children[0] = NULL;
 		children[1] = NULL;
 	}
@@ -140,12 +141,12 @@ public:
 class LeafNode : public BVHNode
 {
 public:
-	LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi) 
+	LeafNode(const BoundBox& bounds, uint visibility, int lo, int hi)
+	: lo(lo),
+	  hi(hi)
 	{
-		m_bounds = bounds;
-		m_visibility = visibility;
-		m_lo = lo;
-		m_hi = hi;
+		this->bounds = bounds;
+		this->visibility = visibility;
 	}
 
 	LeafNode(const LeafNode& s)
@@ -157,14 +158,13 @@ public:
 	bool is_leaf() const { return true; }
 	int num_children() const { return 0; }
 	BVHNode *get_child(int) const { return NULL; }
-	int num_triangles() const { return m_hi - m_lo; }
+	int num_triangles() const { return hi - lo; }
 	void print(int depth) const;
 
-	int m_lo;
-	int m_hi;
+	int lo;
+	int hi;
 };
 
 CCL_NAMESPACE_END
 
 #endif /* __BVH_NODE_H__ */
-
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 65f9da1c194..9795a7a4350 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -18,9 +18,9 @@
 #ifndef __BVH_PARAMS_H__
 #define __BVH_PARAMS_H__
 
-#include "util_boundbox.h"
+#include "util/util_boundbox.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -104,6 +104,7 @@ public:
 		primitive_mask = PRIMITIVE_ALL;
 
 		num_motion_curve_steps = 0;
+		num_motion_triangle_steps = 0;
 	}
 
 	/* SAH costs */
diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp
index e5bcf9995bf..d29629c0279 100644
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -15,12 +15,12 @@
  * limitations under the License.
  */
 
-#include "bvh_build.h"
-#include "bvh_sort.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_sort.h"
 
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_task.h"
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_split.cpp b/intern/cycles/bvh/bvh_split.cpp
index d0d5fbe5a7a..b10d69a495d 100644
--- a/intern/cycles/bvh/bvh_split.cpp
+++ b/intern/cycles/bvh/bvh_split.cpp
@@ -15,14 +15,14 @@
  * limitations under the License.
  */
 
-#include "bvh_build.h"
-#include "bvh_split.h"
-#include "bvh_sort.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_split.h"
+#include "bvh/bvh_sort.h"
 
-#include "mesh.h"
-#include "object.h"
+#include "render/mesh.h"
+#include "render/object.h"
 
-#include "util_algorithm.h"
+#include "util/util_algorithm.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_split.h b/intern/cycles/bvh/bvh_split.h
index dbdb51f1a5b..a874a118b99 100644
--- a/intern/cycles/bvh/bvh_split.h
+++ b/intern/cycles/bvh/bvh_split.h
@@ -18,8 +18,8 @@
 #ifndef __BVH_SPLIT_H__
 #define __BVH_SPLIT_H__
 
-#include "bvh_build.h"
-#include "bvh_params.h"
+#include "bvh/bvh_build.h"
+#include "bvh/bvh_params.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_unaligned.cpp b/intern/cycles/bvh/bvh_unaligned.cpp
index a876c670914..ef227d20ea9 100644
--- a/intern/cycles/bvh/bvh_unaligned.cpp
+++ b/intern/cycles/bvh/bvh_unaligned.cpp
@@ -15,17 +15,17 @@
  */
 
 
-#include "bvh_unaligned.h"
+#include "bvh/bvh_unaligned.h"
 
-#include "mesh.h"
-#include "object.h"
+#include "render/mesh.h"
+#include "render/object.h"
 
-#include "bvh_binning.h"
+#include "bvh/bvh_binning.h"
 #include "bvh_params.h"
 
-#include "util_boundbox.h"
-#include "util_debug.h"
-#include "util_transform.h"
+#include "util/util_boundbox.h"
+#include "util/util_debug.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/bvh/bvh_unaligned.h b/intern/cycles/bvh/bvh_unaligned.h
index 4d0872f4a39..f41bae79e2b 100644
--- a/intern/cycles/bvh/bvh_unaligned.h
+++ b/intern/cycles/bvh/bvh_unaligned.h
@@ -17,7 +17,7 @@
 #ifndef __BVH_UNALIGNED_H__
 #define __BVH_UNALIGNED_H__
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 966ff5e52ba..6ef2aa1caad 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../kernel/osl
-	../util
-	../render
+	..
 	../../glew-mx
 )
 
@@ -33,6 +27,7 @@ set(SRC
 	device_cuda.cpp
 	device_multi.cpp
 	device_opencl.cpp
+	device_split_kernel.cpp
 	device_task.cpp
 )
 
@@ -56,6 +51,7 @@ set(SRC_HEADERS
 	device_memory.h
 	device_intern.h
 	device_network.h
+	device_split_kernel.h
 	device_task.h
 )
 
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 31c99f49d6d..968af447e29 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -17,18 +17,18 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "device.h"
-#include "device_intern.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_half.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_time.h"
-#include "util_types.h"
-#include "util_vector.h"
-#include "util_string.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_half.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_time.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,11 +48,11 @@ std::ostream& operator <<(std::ostream &os,
 	os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
 	/* TODO(sergey): Decode bitflag into list of names. */
 	os << "Nodes features: " << requested_features.nodes_features << std::endl;
-	os << "Use hair: "
+	os << "Use Hair: "
 	   << string_from_bool(requested_features.use_hair) << std::endl;
-	os << "Use object motion: "
+	os << "Use Object Motion: "
 	   << string_from_bool(requested_features.use_object_motion) << std::endl;
-	os << "Use camera motion: "
+	os << "Use Camera Motion: "
 	   << string_from_bool(requested_features.use_camera_motion) << std::endl;
 	os << "Use Baking: "
 	   << string_from_bool(requested_features.use_baking) << std::endl;
@@ -80,7 +80,7 @@ Device::~Device()
 
 void Device::pixels_alloc(device_memory& mem)
 {
-	mem_alloc(mem, MEM_READ_WRITE);
+	mem_alloc("pixels", mem, MEM_READ_WRITE);
 }
 
 void Device::pixels_copy_from(device_memory& mem, int y, int w, int h)
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ccee25ae34e..ac06e561795 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -19,15 +19,15 @@
 
 #include <stdlib.h>
 
-#include "device_memory.h"
-#include "device_task.h"
+#include "device/device_memory.h"
+#include "device/device_task.h"
 
-#include "util_list.h"
-#include "util_stats.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_stats.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -121,6 +121,9 @@ public:
 	/* Use Transparent shadows */
 	bool use_transparent;
 
+	/* Use various shadow tricks, such as shadow catcher. */
+	bool use_shadow_tricks;
+
 	DeviceRequestedFeatures()
 	{
 		/* TODO(sergey): Find more meaningful defaults. */
@@ -137,6 +140,7 @@ public:
 		use_integrator_branched = false;
 		use_patch_evaluation = false;
 		use_transparent = false;
+		use_shadow_tricks = false;
 	}
 
 	bool modified(const DeviceRequestedFeatures& requested_features)
@@ -153,7 +157,8 @@ public:
 		         use_volume == requested_features.use_volume &&
 		         use_integrator_branched == requested_features.use_integrator_branched &&
 		         use_patch_evaluation == requested_features.use_patch_evaluation &&
-		         use_transparent == requested_features.use_transparent);
+		         use_transparent == requested_features.use_transparent &&
+		         use_shadow_tricks == requested_features.use_shadow_tricks);
 	}
 
 	/* Convert the requested features structure to a build options,
@@ -194,9 +199,12 @@ public:
 		if(!use_patch_evaluation) {
 			build_options += " -D__NO_PATCH_EVAL__";
 		}
-		if(!use_transparent) {
+		if(!use_transparent && !use_volume) {
 			build_options += " -D__NO_TRANSPARENT__";
 		}
+		if(!use_shadow_tricks) {
+			build_options += " -D__NO_SHADOW_TRICKS__";
+		}
 		return build_options;
 	}
 };
@@ -228,13 +236,21 @@ public:
 	DeviceInfo info;
 	virtual const string& error_message() { return error_msg; }
 	bool have_error() { return !error_message().empty(); }
+	virtual void set_error(const string& error)
+	{
+		if(!have_error()) {
+			error_msg = error;
+		}
+		fprintf(stderr, "%s\n", error.c_str());
+		fflush(stderr);
+	}
 	virtual bool show_samples() const { return false; }
 
 	/* statistics */
 	Stats &stats;
 
 	/* regular memory */
-	virtual void mem_alloc(device_memory& mem, MemoryType type) = 0;
+	virtual void mem_alloc(const char *name, device_memory& mem, MemoryType type) = 0;
 	virtual void mem_copy_to(device_memory& mem) = 0;
 	virtual void mem_copy_from(device_memory& mem,
 		int y, int w, int h, int elem) = 0;
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index c8e001ec2fd..2761d9488ca 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -20,36 +20,124 @@
 /* So ImathMath is included before our kernel_cpu_compat. */
 #ifdef WITH_OSL
 /* So no context pollution happens from indirectly included windows.h */
-#  include "util_windows.h"
+#  include "util/util_windows.h"
 #  include <OSL/oslexec.h>
 #endif
 
-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"
 
-#include "kernel.h"
-#include "kernel_compat_cpu.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "kernel/kernel.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data.h"
+#include "kernel/kernel_globals.h"
 
-#include "osl_shader.h"
-#include "osl_globals.h"
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_opengl.h"
-#include "util_progress.h"
-#include "util_system.h"
-#include "util_thread.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_opengl.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
+class CPUDevice;
+
+/* Split kernel driver for the CPU device.
+ *
+ * Only declarations live here; the member functions are defined further down
+ * in this file, after the CPUDevice class they depend on. */
+class CPUSplitKernel : public DeviceSplitKernel {
+	/* Device this split kernel was created for. */
+	CPUDevice *device;
+public:
+	explicit CPUSplitKernel(CPUDevice *device);
+
+	/* Run the data initialization kernel for one render tile. */
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
+
+	/* Look up a split kernel by name; the requested features are not used on CPU. */
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+	/* Work size queries used when scheduling the split kernels. */
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+	/* Size of the split state buffer for the given number of threads. */
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+};
+
 class CPUDevice : public Device
 {
+	static unordered_map<string, void*> kernel_functions;
+
+	/* Store `func` in the static kernel_functions map under `name`, replacing
+	 * any entry already registered under the same name. Used as the callback
+	 * passed to the kernel_cpu*_register_functions() calls in the constructor. */
+	static void register_kernel_function(const char* name, void* func)
+	{
+		kernel_functions[name] = func;
+	}
+
+	/* Return the name of the kernel variant to use on the running CPU.
+	 *
+	 * Variants are tried in the order AVX2, AVX, SSE4.1, SSE3, SSE2. A variant
+	 * is only considered when its WITH_CYCLES_OPTIMIZED_KERNEL_* macro is
+	 * defined and the matching system_cpu_support_*() check passes. When none
+	 * matches, "cpu" is returned.
+	 *
+	 * Each #ifdef section ends in a dangling `else`, so the enabled sections
+	 * chain into a single if/else-if ladder that ends in the final block. */
+	static const char* get_arch_name()
+	{
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+		if(system_cpu_support_avx2()) {
+			return "cpu_avx2";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+		if(system_cpu_support_avx()) {
+			return "cpu_avx";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+		if(system_cpu_support_sse41()) {
+			return "cpu_sse41";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+		if(system_cpu_support_sse3()) {
+			return "cpu_sse3";
+		}
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+		if(system_cpu_support_sse2()) {
+			return "cpu_sse2";
+		}
+		else
+#endif
+		{
+			/* Fallback when no optimized variant is enabled and supported. */
+			return "cpu";
+		}
+	}
+
+	/* Look up a registered kernel function and cast it to the pointer type F.
+	 *
+	 * The lookup key is "kernel_<arch>_<name>", where <arch> comes from
+	 * get_arch_name(). Asserts and returns NULL when no such entry exists. */
+	template<typename F>
+	static F get_kernel_function(string name)
+	{
+		const string full_name = string("kernel_") + get_arch_name() + "_" + name;
+
+		unordered_map<string, void*>::iterator found = kernel_functions.find(full_name);
+		if(found != kernel_functions.end()) {
+			return (F)found->second;
+		}
+
+		assert(!"kernel function not found");
+		return NULL;
+	}
+
+	friend class CPUSplitKernel;
+
 public:
 	TaskPool task_pool;
 	KernelGlobals kernel_globals;
@@ -57,10 +145,15 @@ public:
 #ifdef WITH_OSL
 	OSLGlobals osl_globals;
 #endif
+
+	bool use_split_kernel;
+
+	DeviceRequestedFeatures requested_features;
 	
 	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
 	: Device(info, stats, background)
 	{
+
 #ifdef WITH_OSL
 		kernel_globals.osl = &osl_globals;
 #endif
@@ -105,6 +198,28 @@ public:
 		{
 			VLOG(1) << "Will be using regular kernels.";
 		}
+
+		use_split_kernel = DebugFlags().cpu.split_kernel;
+		if(use_split_kernel) {
+			VLOG(1) << "Will be using split kernel.";
+		}
+
+		kernel_cpu_register_functions(register_kernel_function);
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+		kernel_cpu_sse2_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+		kernel_cpu_sse3_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+		kernel_cpu_sse41_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+		kernel_cpu_avx_register_functions(register_kernel_function);
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+		kernel_cpu_avx2_register_functions(register_kernel_function);
+#endif
 	}
 
 	~CPUDevice()
@@ -117,9 +232,20 @@ public:
 		return (TaskScheduler::num_threads() == 1);
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	/* "Allocate" device memory for `mem` on the CPU.
+	 *
+	 * When `mem` already has a host data pointer it is reused as the device
+	 * pointer. Otherwise a block of mem.memory_size() bytes is malloc'ed, which
+	 * mem_free() releases again. A non-NULL `name` is only used for logging. */
+	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
+		if(name) {
+			VLOG(1) << "Buffer allocate: " << name << ", "
+			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			        << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		mem.device_pointer = mem.data_pointer;
+
+		/* No host buffer: the device owns the allocation.
+		 * NOTE(review): the malloc() result is not checked; on failure
+		 * device_pointer stays 0 while the size is still added to stats. */
+		if(!mem.device_pointer) {
+			mem.device_pointer = (device_ptr)malloc(mem.memory_size());
+		}
+
 		mem.device_size = mem.memory_size();
 		stats.mem_alloc(mem.device_size);
 	}
@@ -144,6 +270,10 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		if(mem.device_pointer) {
+			if(!mem.data_pointer) {
+				free((void*)mem.device_pointer);
+			}
+
 			mem.device_pointer = 0;
 			stats.mem_free(mem.device_size);
 			mem.device_size = 0;
@@ -196,8 +326,14 @@ public:
 
 	void thread_run(DeviceTask *task)
 	{
-		if(task->type == DeviceTask::PATH_TRACE)
-			thread_path_trace(*task);
+		if(task->type == DeviceTask::PATH_TRACE) {
+			if(!use_split_kernel) {
+				thread_path_trace(*task);
+			}
+			else {
+				thread_path_trace_split(*task);
+			}
+		}
 		else if(task->type == DeviceTask::FILM_CONVERT)
 			thread_film_convert(*task);
 		else if(task->type == DeviceTask::SHADER)
@@ -258,7 +394,7 @@ public:
 		{
 			path_trace_kernel = kernel_cpu_path_trace;
 		}
-		
+
 		while(task.acquire_tile(this, tile)) {
 			float *render_buffer = (float*)tile.buffer;
 			uint *rng_state = (uint*)tile.rng_state;
@@ -294,6 +430,49 @@ public:
 		thread_kernel_globals_free(&kg);
 	}
 
+	/* Path trace all tiles of `task` using the split kernel.
+	 *
+	 * A KernelGlobals instance is placed in a device buffer, the split kernels
+	 * are loaded, and tiles are then rendered one at a time until none are
+	 * left or the task pool is canceled (unless the task asks for the queue
+	 * to be finished). */
+	void thread_path_trace_split(DeviceTask& task)
+	{
+		if(task_pool.canceled() && task.need_finish_queue == false) {
+			return;
+		}
+
+		RenderTile tile;
+
+		CPUSplitKernel split_kernel(this);
+
+		/* Buffer holding this thread's kernel globals. */
+		device_memory kgbuffer;
+		kgbuffer.resize(sizeof(KernelGlobals));
+		mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
+
+		KernelGlobals *kg = (KernelGlobals*)kgbuffer.device_pointer;
+		*kg = thread_kernel_globals_init();
+
+		requested_features.max_closure = MAX_CLOSURE;
+
+		/* Only render when the kernels loaded; cleanup below runs either way. */
+		if(split_kernel.load_kernels(requested_features)) {
+			while(task.acquire_tile(this, tile)) {
+				device_memory data;
+				split_kernel.path_trace(&task, tile, kgbuffer, data);
+
+				task.release_tile(tile);
+
+				if(task_pool.canceled() && task.need_finish_queue == false) {
+					break;
+				}
+			}
+		}
+
+		thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
+		mem_free(kgbuffer);
+	}
+
 	void thread_film_convert(DeviceTask& task)
 	{
 		float sample_scale = 1.0f/(task.sample + 1);
@@ -501,6 +680,10 @@ protected:
 
 	inline void thread_kernel_globals_free(KernelGlobals *kg)
 	{
+		if(kg == NULL) {
+			return;
+		}
+
 		if(kg->transparent_shadow_intersections != NULL) {
 			free(kg->transparent_shadow_intersections);
 		}
@@ -515,8 +698,175 @@ protected:
 		OSLShader::thread_free(kg);
 #endif
 	}
+
+	/* Remember the requested features for later use by the split kernel;
+	 * nothing is compiled or loaded here, so this always succeeds.
+	 *
+	 * NOTE(review): the base Device::load_kernels() signature is not visible
+	 * here. If it takes a `const DeviceRequestedFeatures&`, this non-const
+	 * overload hides it instead of overriding it - confirm. */
+	virtual bool load_kernels(DeviceRequestedFeatures& requested_features_) {
+		requested_features = requested_features_;
+
+		return true;
+	}
+};
+
+/* split kernel */
+
+/* A single split kernel on the CPU: a plain function pointer that is called
+ * once for every (x, y) position of the requested global size. */
+class CPUSplitKernelFunction : public SplitKernelFunction {
+public:
+	CPUDevice* device;
+	/* Kernel entry point, NULL until it has been looked up. */
+	void (*func)(KernelGlobals *kg, KernelData *data);
+
+	CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
+	~CPUSplitKernelFunction() {}
+
+	/* Run the kernel over the whole global size, sequentially on the calling
+	 * thread. Returns false when no kernel function is set. */
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
+	{
+		if(func == NULL) {
+			return false;
+		}
+
+		KernelGlobals *globals = (KernelGlobals*)kernel_globals.device_pointer;
+		KernelData *kernel_data = (KernelData*)data.device_pointer;
+
+		globals->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+		for(int row = 0; row < dim.global_size[1]; row++) {
+			for(int col = 0; col < dim.global_size[0]; col++) {
+				/* The kernel reads its work item position from the globals. */
+				globals->global_id = make_int2(col, row);
+				func(globals, kernel_data);
+			}
+		}
+
+		return true;
+	}
 };
 
+/* Pass the device on to the DeviceSplitKernel base and keep a typed pointer to it. */
+CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+/* Run the data_init kernel for one tile on the CPU.
+ *
+ * The kernel variant is chosen with the same AVX2 > AVX > SSE4.1 > SSE3 > SSE2
+ * > plain order as CPUDevice::get_arch_name(). It is then called once for each
+ * (x, y) position of dim.global_size, with kg->global_id set to that position
+ * and otherwise identical arguments. Always returns true. */
+bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                                    RenderTile& rtile,
+                                                    int num_global_elements,
+                                                    device_memory& kernel_globals,
+                                                    device_memory& data,
+                                                    device_memory& split_data,
+                                                    device_memory& ray_state,
+                                                    device_memory& queue_index,
+                                                    device_memory& use_queues_flags,
+                                                    device_memory& work_pool_wgs)
+{
+	/* Signature shared by all kernel_cpu*_data_init variants. */
+	typedef void(*data_init_t)(KernelGlobals *kg,
+	                           ccl_constant KernelData *data,
+	                           ccl_global void *split_data_buffer,
+	                           int num_elements,
+	                           ccl_global char *ray_state,
+	                           ccl_global uint *rng_state,
+	                           int start_sample,
+	                           int end_sample,
+	                           int sx, int sy, int sw, int sh, int offset, int stride,
+	                           ccl_global int *Queue_index,
+	                           int queuesize,
+	                           ccl_global char *use_queues_flag,
+	                           ccl_global unsigned int *work_pool_wgs,
+	                           unsigned int num_samples,
+	                           ccl_global float *buffer);
+
+	data_init_t data_init;
+
+	/* The dangling `else` at the end of each #ifdef section chains the
+	 * enabled sections into one if/else-if ladder. */
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+	if(system_cpu_support_avx2()) {
+		data_init = kernel_cpu_avx2_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+	if(system_cpu_support_avx()) {
+		data_init = kernel_cpu_avx_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+	if(system_cpu_support_sse41()) {
+		data_init = kernel_cpu_sse41_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+	if(system_cpu_support_sse3()) {
+		data_init = kernel_cpu_sse3_data_init;
+	}
+	else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+	if(system_cpu_support_sse2()) {
+		data_init = kernel_cpu_sse2_data_init;
+	}
+	else
+#endif
+	{
+		data_init = kernel_cpu_data_init;
+	}
+
+	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+	kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+	for(int y = 0; y < dim.global_size[1]; y++) {
+		for(int x = 0; x < dim.global_size[0]; x++) {
+			/* The kernel reads its work item position from the globals. */
+			kg->global_id = make_int2(x, y);
+
+			data_init((KernelGlobals*)kernel_globals.device_pointer,
+			          (KernelData*)data.device_pointer,
+			          (void*)split_data.device_pointer,
+			          num_global_elements,
+			          (char*)ray_state.device_pointer,
+			          (uint*)rtile.rng_state,
+			          rtile.start_sample,
+			          rtile.start_sample + rtile.num_samples,
+			          rtile.x,
+			          rtile.y,
+			          rtile.w,
+			          rtile.h,
+			          rtile.offset,
+			          rtile.stride,
+			          (int*)queue_index.device_pointer,
+			          dim.global_size[0] * dim.global_size[1],
+			          (char*)use_queues_flags.device_pointer,
+			          (uint*)work_pool_wgs.device_pointer,
+			          rtile.num_samples,
+			          (float*)rtile.buffer);
+		}
+	}
+
+	return true;
+}
+
+/* Create the CPU split kernel wrapper for `kernel_name`.
+ *
+ * The function pointer is looked up through CPUDevice::get_kernel_function().
+ * Returns NULL when no function is registered under that name; otherwise the
+ * caller receives a newly allocated CPUSplitKernelFunction. */
+SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+	typedef void (*split_kernel_func_t)(KernelGlobals*, KernelData*);
+
+	CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
+	kernel->func = device->get_kernel_function<split_kernel_func_t>(kernel_name);
+
+	if(kernel->func != NULL) {
+		return kernel;
+	}
+
+	/* Lookup failed, do not hand out an unusable wrapper. */
+	delete kernel;
+	return NULL;
+}
+
+/* Local work size used on the CPU: always 1 x 1. */
+int2 CPUSplitKernel::split_kernel_local_size()
+{
+	return make_int2(1, 1);
+}
+
+/* Global work size used on the CPU: a fixed 64 x 1, independent of the
+ * arguments (all three are ignored). */
+int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
+	return make_int2(64, 1);
+}
+
+/* Size of the split state buffer for `num_threads` threads, as computed by
+ * split_data_buffer_size() from the kernel globals stored in `kernel_globals`. */
+uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
+	KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+
+	return split_data_buffer_size(kg, num_threads);
+}
+
+unordered_map<string, void*> CPUDevice::kernel_functions;
+
 Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
 {
 	return new CPUDevice(info, stats, background);
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index dafac6dfcb3..606494f08ed 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -15,32 +15,36 @@
  */
 
 #include <climits>
+#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "device.h"
-#include "device_intern.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_split_kernel.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
 #ifdef WITH_CUDA_DYNLOAD
 #  include "cuew.h"
 #else
-#  include "util_opengl.h"
+#  include "util/util_opengl.h"
 #  include <cuda.h>
 #  include <cudaGL.h>
 #endif
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_md5.h"
-#include "util_opengl.h"
-#include "util_path.h"
-#include "util_string.h"
-#include "util_system.h"
-#include "util_types.h"
-#include "util_time.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_md5.h"
+#include "util/util_opengl.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
+#include "util/util_types.h"
+#include "util/util_time.h"
+
+#include "kernel/split/kernel_split_data_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -78,6 +82,31 @@ int cuewCompilerVersion(void)
 }  /* namespace */
 #endif  /* WITH_CUDA_DYNLOAD */
 
+class CUDADevice;
+
+/* Split kernel driver for the CUDA device.
+ *
+ * Only declarations live here; the member functions are defined further down
+ * in this file, after the CUDADevice class they depend on. */
+class CUDASplitKernel : public DeviceSplitKernel {
+	/* Device this split kernel was created for. */
+	CUDADevice *device;
+public:
+	explicit CUDASplitKernel(CUDADevice *device);
+
+	/* Size of the split state buffer for the given number of threads. */
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+
+	/* Launch the data initialization kernel for one render tile. */
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs);
+
+	/* Look up a split kernel by name in the loaded CUDA module. */
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&);
+	/* Work size queries used when scheduling the split kernels. */
+	virtual int2 split_kernel_local_size();
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+};
+
 class CUDADevice : public Device
 {
 public:
@@ -258,16 +287,21 @@ public:
 		return DebugFlags().cuda.adaptive_compile;
 	}
 
+	/* Whether the split kernel is enabled for CUDA through the debug flags. */
+	bool use_split_kernel()
+	{
+		return DebugFlags().cuda.split_kernel;
+	}
+
 	/* Common NVCC flags which stays the same regardless of shading model,
 	 * kernel sources md5 and only depends on compiler or compilation settings.
 	 */
 	string compile_kernel_get_common_cflags(
-	        const DeviceRequestedFeatures& requested_features)
+	        const DeviceRequestedFeatures& requested_features, bool split=false)
 	{
 		const int cuda_version = cuewCompilerVersion();
 		const int machine = system_cpu_bits();
-		const string kernel_path = path_get("kernel");
-		const string include = kernel_path;
+		const string source_path = path_get("source");
+		const string include_path = source_path;
 		string cflags = string_printf("-m%d "
 		                              "--ptxas-options=\"-v\" "
 		                              "--use_fast_math "
@@ -276,7 +310,7 @@ public:
 		                               "-I\"%s\"",
 		                              machine,
 		                              cuda_version,
-		                              include.c_str());
+		                              include_path.c_str());
 		if(use_adaptive_compilation()) {
 			cflags += " " + requested_features.get_build_options();
 		}
@@ -287,6 +321,11 @@ public:
 #ifdef WITH_CYCLES_DEBUG
 		cflags += " -D__KERNEL_DEBUG__";
 #endif
+
+		if(split) {
+			cflags += " -D__SPLIT__";
+		}
+
 		return cflags;
 	}
 
@@ -306,21 +345,21 @@ public:
 			cuda_error_message("CUDA nvcc compiler version could not be parsed.");
 			return false;
 		}
-		if(cuda_version < 75) {
+		if(cuda_version < 80) {
 			printf("Unsupported CUDA version %d.%d detected, "
-			       "you need CUDA 7.5 or newer.\n",
+			       "you need CUDA 8.0 or newer.\n",
 			       major, minor);
 			return false;
 		}
-		else if(cuda_version != 75 && cuda_version != 80) {
+		else if(cuda_version != 80) {
 			printf("CUDA version %d.%d detected, build may succeed but only "
-			       "CUDA 7.5 and 8.0 are officially supported.\n",
+			       "CUDA 8.0 is officially supported.\n",
 			       major, minor);
 		}
 		return true;
 	}
 
-	string compile_kernel(const DeviceRequestedFeatures& requested_features)
+	string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
 	{
 		/* Compute cubin name. */
 		int major, minor;
@@ -329,7 +368,8 @@ public:
 
 		/* Attempt to use kernel provided with Blender. */
 		if(!use_adaptive_compilation()) {
-			const string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin",
+			const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
+			                                                  : "lib/kernel_sm_%d%d.cubin",
 			                                            major, minor));
 			VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
 			if(path_exists(cubin)) {
@@ -339,18 +379,19 @@ public:
 		}
 
 		const string common_cflags =
-		        compile_kernel_get_common_cflags(requested_features);
+		        compile_kernel_get_common_cflags(requested_features, split);
 
 		/* Try to use locally compiled kernel. */
-		const string kernel_path = path_get("kernel");
-		const string kernel_md5 = path_files_md5_hash(kernel_path);
+		const string source_path = path_get("source");
+		const string kernel_md5 = path_files_md5_hash(source_path);
 
 		/* We include cflags into md5 so changing cuda toolkit or changing other
 		 * compiler command line arguments makes sure cubin gets re-built.
 		 */
 		const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
 
-		const string cubin_file = string_printf("cycles_kernel_sm%d%d_%s.cubin",
+		const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
+		                                              : "cycles_kernel_sm%d%d_%s.cubin",
 		                                        major, minor,
 		                                        cubin_md5.c_str());
 		const string cubin = path_cache_get(path_join("kernels", cubin_file));
@@ -383,9 +424,10 @@ public:
 			return "";
 		}
 		const char *nvcc = cuewCompilerPath();
-		const string kernel = path_join(kernel_path,
-		                          path_join("kernels",
-		                                    path_join("cuda", "kernel.cu")));
+		const string kernel = path_join(
+		        path_join(source_path, "kernel"),
+		        path_join("kernels",
+		                  path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
 		double starttime = time_dt();
 		printf("Compiling CUDA kernel ...\n");
 
@@ -433,7 +475,7 @@ public:
 			return false;
 
 		/* get kernel */
-		string cubin = compile_kernel(requested_features);
+		string cubin = compile_kernel(requested_features, use_split_kernel());
 
 		if(cubin == "")
 			return false;
@@ -466,8 +508,14 @@ public:
 		}
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType /*type*/)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType /*type*/)
 	{
+		if(name) {
+			VLOG(1) << "Buffer allocate: " << name << ", "
+			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			        << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		cuda_push_context();
 		CUdeviceptr device_pointer;
 		size_t size = mem.memory_size();
@@ -504,7 +552,9 @@ public:
 
 	void mem_zero(device_memory& mem)
 	{
-		memset((void*)mem.data_pointer, 0, mem.memory_size());
+		if(mem.data_pointer) {
+			memset((void*)mem.data_pointer, 0, mem.memory_size());
+		}
 
 		cuda_push_context();
 		if(mem.device_pointer)
@@ -617,7 +667,7 @@ public:
 		/* Data Storage */
 		if(interpolation == INTERPOLATION_NONE) {
 			if(has_bindless_textures) {
-				mem_alloc(mem, MEM_READ_ONLY);
+				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);
 
 				cuda_push_context();
@@ -641,7 +691,7 @@ public:
 				cuda_pop_context();
 			}
 			else {
-				mem_alloc(mem, MEM_READ_ONLY);
+				mem_alloc(NULL, mem, MEM_READ_ONLY);
 				mem_copy_to(mem);
 
 				cuda_push_context();
@@ -1258,25 +1308,48 @@ public:
 			/* Upload Bindless Mapping */
 			load_bindless_mapping();
 
-			/* keep rendering tiles until done */
-			while(task->acquire_tile(this, tile)) {
-				int start_sample = tile.start_sample;
-				int end_sample = tile.start_sample + tile.num_samples;
+			if(!use_split_kernel()) {
+				/* keep rendering tiles until done */
+				while(task->acquire_tile(this, tile)) {
+					int start_sample = tile.start_sample;
+					int end_sample = tile.start_sample + tile.num_samples;
 
-				for(int sample = start_sample; sample < end_sample; sample++) {
-					if(task->get_cancel()) {
-						if(task->need_finish_queue == false)
-							break;
-					}
+					for(int sample = start_sample; sample < end_sample; sample++) {
+						if(task->get_cancel()) {
+							if(task->need_finish_queue == false)
+								break;
+						}
 
-					path_trace(tile, sample, branched);
+						path_trace(tile, sample, branched);
 
-					tile.sample = sample + 1;
+						tile.sample = sample + 1;
 
-					task->update_progress(&tile, tile.w*tile.h);
+						task->update_progress(&tile, tile.w*tile.h);
+					}
+
+					task->release_tile(tile);
+				}
+			}
+			else {
+				DeviceRequestedFeatures requested_features;
+				if(!use_adaptive_compilation()) {
+					requested_features.max_closure = 64;
 				}
 
-				task->release_tile(tile);
+				CUDASplitKernel split_kernel(this);
+				split_kernel.load_kernels(requested_features);
+
+				while(task->acquire_tile(this, tile)) {
+					device_memory void_buffer;
+					split_kernel.path_trace(task, tile, void_buffer, void_buffer);
+
+					task->release_tile(tile);
+
+					if(task->get_cancel()) {
+						if(task->need_finish_queue == false)
+							break;
+					}
+				}
 			}
 		}
 		else if(task->type == DeviceTask::SHADER) {
@@ -1329,8 +1402,223 @@ public:
 	{
 		task_pool.cancel();
 	}
+
+	friend class CUDASplitKernelFunction;
+	friend class CUDASplitKernel;
+};
+
+/* Redefine the cuda_assert macro so it can be used outside of the CUDADevice
+ * class, now that the definition of that class is complete.
+ *
+ * This variant reaches the device through a variable named `device`, which
+ * must be in scope where the macro is used. When `stmt` does not return
+ * CUDA_SUCCESS, the message is stored in device->error_msg only if that is
+ * still empty, printed to stderr, and device->cuda_error_documentation() is
+ * called. Execution continues after the macro either way.
+ */
+#undef cuda_assert
+#define cuda_assert(stmt) \
+	{ \
+		CUresult result = stmt; \
+		\
+		if(result != CUDA_SUCCESS) { \
+			string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+			if(device->error_msg == "") \
+				device->error_msg = message; \
+			fprintf(stderr, "%s\n", message.c_str()); \
+			/*cuda_abort();*/ \
+			device->cuda_error_documentation(); \
+		} \
+	} (void)0
+
+/* split kernel */
+
+/* A single split kernel on the CUDA device, wrapping the CUfunction handle
+ * that was looked up in the device's module. */
+class CUDASplitKernelFunction : public SplitKernelFunction{
+	CUDADevice* device;
+	CUfunction func;
+public:
+	CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
+
+	/* enqueue the kernel without arguments, returns false if there is an error */
+	bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
+	{
+		return enqueue(dim, NULL);
+	}
+
+	/* enqueue the kernel, returns false if there is an error
+	 *
+	 * `args` is passed on as the kernel parameter array of cuLaunchKernel().
+	 * Every exit path pops the context that is pushed on entry. */
+	bool enqueue(const KernelDimensions &dim, void *args[])
+	{
+		device->cuda_push_context();
+
+		if(device->have_error()) {
+			/* Keep push/pop balanced on the early exit as well. */
+			device->cuda_pop_context();
+			return false;
+		}
+
+		/* we ignore dim.local_size for now, as this is faster */
+		int threads_per_block;
+		cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+		if(device->have_error()) {
+			/* threads_per_block was not written, so no valid launch
+			 * configuration can be derived from it. */
+			device->cuda_pop_context();
+			return false;
+		}
+
+		/* Square thread block, with enough blocks to cover the global size. */
+		int xthreads = (int)sqrt(threads_per_block);
+		int ythreads = (int)sqrt(threads_per_block);
+
+		int xblocks = (dim.global_size[0] + xthreads - 1)/xthreads;
+		int yblocks = (dim.global_size[1] + ythreads - 1)/ythreads;
+
+		cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+		cuda_assert(cuLaunchKernel(func,
+		                           xblocks , yblocks, 1, /* blocks */
+		                           xthreads, ythreads, 1, /* threads */
+		                           0, 0, args, 0));
+
+		device->cuda_pop_context();
+
+		return !device->have_error();
+	}
 };
 
+/* Pass the device on to the DeviceSplitKernel base and keep a typed pointer to it. */
+CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
+{
+}
+
+/* Ask the device how large the split state buffer must be for `num_threads`.
+ *
+ * Launches the "kernel_cuda_state_buffer_size" kernel with a single thread,
+ * which writes its result into a one element device buffer; that value is
+ * copied back and returned.
+ *
+ * NOTE(review): errors from cuModuleGetFunction()/cuLaunchKernel() are only
+ * recorded by cuda_assert; the launch and the read-back still happen, and
+ * the returned value is whatever the buffer holds in that case. */
+uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+{
+	device_vector<uint64_t> size_buffer;
+	size_buffer.resize(1);
+	device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+	device->cuda_push_context();
+
+	/* The kernel argument is a uint, so the size_t count is narrowed here. */
+	uint threads = num_threads;
+	CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+
+	/* One pointer per kernel argument, in argument order; the struct is
+	 * handed to cuLaunchKernel() as its void** parameter array below. */
+	struct args_t {
+		uint* num_threads;
+		CUdeviceptr* size;
+	};
+
+	args_t args = {
+		&threads,
+		&d_size
+	};
+
+	CUfunction state_buffer_size;
+	cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+
+	cuda_assert(cuLaunchKernel(state_buffer_size,
+	                           1, 1, 1,
+	                           1, 1, 1,
+	                           0, 0, (void**)&args, 0));
+
+	device->cuda_pop_context();
+
+	device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));
+	device->mem_free(size_buffer);
+
+	/* Read after mem_free(): presumably get_data() is the host side copy
+	 * filled by mem_copy_from() and is left intact by mem_free() - confirm. */
+	return *size_buffer.get_data();
+}
+
+/* Launch the "kernel_cuda_path_trace_data_init" kernel for one render tile.
+ *
+ * Device pointers are resolved for all buffers, packed together with the
+ * tile parameters into the kernel argument list, and the kernel is enqueued
+ * over `dim`. Returns false when the device reports an error; every exit
+ * path pops the context that is pushed on entry. */
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
+                                    RenderTile& rtile,
+                                    int num_global_elements,
+                                    device_memory& /*kernel_globals*/,
+                                    device_memory& /*kernel_data*/,
+                                    device_memory& split_data,
+                                    device_memory& ray_state,
+                                    device_memory& queue_index,
+                                    device_memory& use_queues_flag,
+                                    device_memory& work_pool_wgs)
+{
+	device->cuda_push_context();
+
+	CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+	CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+	CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+	CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+	CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+	CUdeviceptr d_rng_state = device->cuda_device_ptr(rtile.rng_state);
+	CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+	int end_sample = rtile.start_sample + rtile.num_samples;
+	int queue_size = dim.global_size[0] * dim.global_size[1];
+
+	/* One pointer per kernel argument, in argument order; the struct is
+	 * handed to the launch as its void** parameter array below. */
+	struct args_t {
+		CUdeviceptr* split_data_buffer;
+		int* num_elements;
+		CUdeviceptr* ray_state;
+		CUdeviceptr* rng_state;
+		int* start_sample;
+		int* end_sample;
+		int* sx;
+		int* sy;
+		int* sw;
+		int* sh;
+		int* offset;
+		int* stride;
+		CUdeviceptr* queue_index;
+		int* queuesize;
+		CUdeviceptr* use_queues_flag;
+		CUdeviceptr* work_pool_wgs;
+		int* num_samples;
+		CUdeviceptr* buffer;
+	};
+
+	args_t args = {
+		&d_split_data,
+		&num_global_elements,
+		&d_ray_state,
+		&d_rng_state,
+		&rtile.start_sample,
+		&end_sample,
+		&rtile.x,
+		&rtile.y,
+		&rtile.w,
+		&rtile.h,
+		&rtile.offset,
+		&rtile.stride,
+		&d_queue_index,
+		&queue_size,
+		&d_use_queues_flag,
+		&d_work_pool_wgs,
+		&rtile.num_samples,
+		&d_buffer
+	};
+
+	CUfunction data_init;
+	cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+	if(device->have_error()) {
+		/* Keep push/pop balanced on the error path as well. */
+		device->cuda_pop_context();
+		return false;
+	}
+
+	CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
+
+	device->cuda_pop_context();
+
+	return !device->have_error();
+}
+
+/* Look up the split kernel "kernel_cuda_<kernel_name>" in the device's module.
+ *
+ * Returns a newly allocated CUDASplitKernelFunction wrapping the handle, or
+ * NULL when the device reports an error. Every exit path pops the context
+ * that is pushed on entry. */
+SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&)
+{
+	/* c_str() rather than data(): the CUDA API and the printf style format
+	 * below both need a null terminated string. */
+	const string function_name = string("kernel_cuda_") + kernel_name;
+	CUfunction func;
+
+	device->cuda_push_context();
+
+	cuda_assert(cuModuleGetFunction(&func, device->cuModule, function_name.c_str()));
+	if(device->have_error()) {
+		device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.c_str()));
+		/* Keep push/pop balanced on the error path as well. */
+		device->cuda_pop_context();
+		return NULL;
+	}
+
+	device->cuda_pop_context();
+
+	return new CUDASplitKernelFunction(device, func);
+}
+
+/* Local work size reported for CUDA: always 32 x 1. */
+int2 CUDASplitKernel::split_kernel_local_size()
+{
+	return make_int2(32, 1);
+}
+
+/* Global work size used for CUDA: currently a fixed 256 x 256, independent
+ * of the arguments (all three are ignored). */
+int2 CUDASplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask */*task*/)
+{
+	/* TODO(mai): implement something here to detect ideal work size */
+	return make_int2(256, 256);
+}
+
 bool device_cuda_init(void)
 {
 #ifdef WITH_CUDA_DYNLOAD
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 5b5b4dc6802..4b10514a9d2 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -28,10 +28,10 @@
  * other devices this is a pointer to device memory, where we will copy memory
  * to and from. */
 
-#include "util_debug.h"
-#include "util_half.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_debug.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,7 +48,8 @@ enum DataType {
 	TYPE_UINT,
 	TYPE_INT,
 	TYPE_FLOAT,
-	TYPE_HALF
+	TYPE_HALF,
+	TYPE_UINT64,
 };
 
 static inline size_t datatype_size(DataType datatype) 
@@ -59,6 +60,7 @@ static inline size_t datatype_size(DataType datatype)
 		case TYPE_UINT: return sizeof(uint);
 		case TYPE_INT: return sizeof(int);
 		case TYPE_HALF: return sizeof(half);
+		case TYPE_UINT64: return sizeof(uint64_t);
 		default: return 0;
 	}
 }
@@ -160,6 +162,11 @@ template<> struct device_type_traits<half4> {
 	static const int num_elements = 4;
 };
 
+template<> struct device_type_traits<uint64_t> { /* traits for buffers whose elements are uint64_t */
+	static const DataType data_type = TYPE_UINT64; /* datatype_size() maps this to sizeof(uint64_t) */
+	static const int num_elements = 1; /* one value per element */
+};
+
 /* Device Memory */
 
 class device_memory
@@ -180,10 +187,27 @@ public:
 	/* device pointer */
 	device_ptr device_pointer;
 
-protected:
-	device_memory() {}
+	device_memory()
+	{ /* Starts out as an empty buffer description with uchar elements and nothing allocated. */
+		data_type = device_type_traits<uchar>::data_type; /* default element type; device_vector<T> overwrites both type fields */
+		data_elements = device_type_traits<uchar>::num_elements;
+		data_pointer = 0;
+		data_size = 0;
+		device_size = 0;
+		data_width = 0;
+		data_height = 0;
+		data_depth = 0;
+		device_pointer = 0; /* must be zero again by destruction time: the destructor asserts on it */
+	}
 	virtual ~device_memory() { assert(!device_pointer); }
 
+	void resize(size_t size)
+	{ /* Only records the new size in the bookkeeping fields; nothing is allocated or freed here. */
+		data_size = size; /* callers in this change pass byte counts, matching the default uchar element type */
+		data_width = size; /* data_height and data_depth are left untouched */
+	}
+
+protected:
 	/* no copying */
 	device_memory(const device_memory&);
 	device_memory& operator = (const device_memory&);
@@ -198,16 +222,8 @@ public:
 	{
 		data_type = device_type_traits<T>::data_type;
 		data_elements = device_type_traits<T>::num_elements;
-		data_pointer = 0;
-		data_size = 0;
-		device_size = 0;
-		data_width = 0;
-		data_height = 0;
-		data_depth = 0;
 
 		assert(data_elements > 0);
-
-		device_pointer = 0;
 	}
 
 	virtual ~device_vector() {}
@@ -266,6 +282,7 @@ public:
 		data_height = 0;
 		data_depth = 0;
 		data_size = 0;
+		device_pointer = 0;
 	}
 
 	size_t size()
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 31b800640d3..624260a81c8 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -17,17 +17,17 @@
 #include <stdlib.h>
 #include <sstream>
 
-#include "device.h"
-#include "device_intern.h"
-#include "device_network.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_network.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_foreach.h"
-#include "util_list.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_time.h"
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -106,11 +106,11 @@ public:
 		return true;
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
 	{
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = 0;
-			sub.device->mem_alloc(mem, type);
+			sub.device->mem_alloc(name, mem, type);
 			sub.ptr_map[unique_ptr] = mem.device_pointer;
 		}
 
@@ -162,6 +162,7 @@ public:
 	void mem_free(device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
+		stats.mem_free(mem.device_size);
 
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
@@ -170,7 +171,6 @@ public:
 		}
 
 		mem.device_pointer = 0;
-		stats.mem_free(mem.device_size);
 	}
 
 	void const_copy_to(const char *name, void *host, size_t size)
@@ -202,6 +202,7 @@ public:
 	void tex_free(device_memory& mem)
 	{
 		device_ptr tmp = mem.device_pointer;
+		stats.mem_free(mem.device_size);
 
 		foreach(SubDevice& sub, devices) {
 			mem.device_pointer = sub.ptr_map[tmp];
@@ -210,7 +211,6 @@ public:
 		}
 
 		mem.device_pointer = 0;
-		stats.mem_free(mem.device_size);
 	}
 
 	void pixels_alloc(device_memory& mem)
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 53eef6cf199..66758954f44 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "device_intern.h"
-#include "device_network.h"
+#include "device/device.h"
+#include "device/device_intern.h"
+#include "device/device_network.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 #if defined(WITH_NETWORK)
 
@@ -87,8 +87,14 @@ public:
 		snd.write();
 	}
 
-	void mem_alloc(device_memory& mem, MemoryType type)
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type)
 	{
+		if(name) {
+			VLOG(1) << "Buffer allocate: " << name << ", "
+				    << string_human_readable_number(mem.memory_size()) << " bytes. ("
+				    << string_human_readable_size(mem.memory_size()) << ")";
+		}
+
 		thread_scoped_lock lock(rpc_lock);
 
 		mem.device_pointer = ++mem_counter;
@@ -481,7 +487,7 @@ protected:
 				mem.data_pointer = 0;
 
 			/* perform the allocation on the actual device */
-			device->mem_alloc(mem, type);
+			device->mem_alloc(NULL, mem, type);
 
 			/* store a mapping to/from client_pointer and real device pointer */
 			pointer_mapping_insert(client_pointer, mem.device_pointer);
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index d28cfe3121f..a5d24c66018 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -33,12 +33,12 @@
 #include <sstream>
 #include <deque>
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_foreach.h"
-#include "util_list.h"
-#include "util_map.h"
-#include "util_string.h"
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index ba94c592a5f..edd2047debc 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -16,12 +16,12 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl/opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "device_intern.h"
+#include "device/device_intern.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
new file mode 100644
index 00000000000..ae462a560b7
--- /dev/null
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_split_kernel.h"
+
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
+
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+static const double alpha = 0.1; /* alpha for rolling average */
+
+DeviceSplitKernel::DeviceSplitKernel(Device *device)
+: device(device),
+  kernel_path_init(NULL),
+  kernel_scene_intersect(NULL),
+  kernel_lamp_emission(NULL),
+  kernel_do_volume(NULL),
+  kernel_queue_enqueue(NULL),
+  kernel_indirect_background(NULL),
+  kernel_shader_eval(NULL),
+  kernel_holdout_emission_blurring_pathtermination_ao(NULL),
+  kernel_subsurface_scatter(NULL),
+  kernel_direct_lighting(NULL),
+  kernel_shadow_blocked_ao(NULL),
+  kernel_shadow_blocked_dl(NULL),
+  kernel_next_iteration_setup(NULL),
+  kernel_indirect_subsurface(NULL),
+  kernel_buffer_update(NULL),
+  avg_time_per_sample(0.0),
+  current_max_closure(-1),
+  first_tile(true)
+{
+	/* No kernels loaded and no timing data yet; initializers follow the member declaration order. */
+}
+
+DeviceSplitKernel::~DeviceSplitKernel()
+{
+	/* Hand the shared state buffers back to the device. */
+	device->mem_free(split_data);
+	device->mem_free(ray_state);
+	device->mem_free(use_queues_flag);
+	device->mem_free(queue_index);
+	device->mem_free(work_pool_wgs);
+
+	/* Destroy the kernel function objects in the order they are loaded; deleting NULL is harmless. */
+	SplitKernelFunction *const functions[] = {
+		kernel_path_init, kernel_scene_intersect,
+		kernel_lamp_emission, kernel_do_volume,
+		kernel_queue_enqueue, kernel_indirect_background,
+		kernel_shader_eval, kernel_holdout_emission_blurring_pathtermination_ao,
+		kernel_subsurface_scatter, kernel_direct_lighting,
+		kernel_shadow_blocked_ao, kernel_shadow_blocked_dl,
+		kernel_next_iteration_setup, kernel_indirect_subsurface,
+		kernel_buffer_update
+	};
+	for(size_t i = 0; i < sizeof(functions) / sizeof(functions[0]); i++) {
+		delete functions[i];
+	}
+}
+
+bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
+{
+	/* Loads every split kernel function; false on the first failure. An earlier function object is freed first so reloads don't leak. */
+#define LOAD_KERNEL(name) \
+		delete kernel_##name; \
+		kernel_##name = get_split_kernel_function(#name, requested_features); \
+		if(!kernel_##name) { \
+			return false; \
+		}
+
+	LOAD_KERNEL(path_init);
+	LOAD_KERNEL(scene_intersect);
+	LOAD_KERNEL(lamp_emission);
+	LOAD_KERNEL(do_volume);
+	LOAD_KERNEL(queue_enqueue);
+	LOAD_KERNEL(indirect_background);
+	LOAD_KERNEL(shader_eval);
+	LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+	LOAD_KERNEL(subsurface_scatter);
+	LOAD_KERNEL(direct_lighting);
+	LOAD_KERNEL(shadow_blocked_ao);
+	LOAD_KERNEL(shadow_blocked_dl);
+	LOAD_KERNEL(next_iteration_setup);
+	LOAD_KERNEL(indirect_subsurface);
+	LOAD_KERNEL(buffer_update);
+
+#undef LOAD_KERNEL
+	current_max_closure = requested_features.max_closure; /* remember which closure limit these kernels were loaded for */
+	return true;
+}
+
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
+{
+	/* The per-element size is the state buffer size for 1024 threads divided by 1024 (integer division). */
+	return max_buffer_size / (state_buffer_size(kg, data, 1024) / 1024);
+}
+
+bool DeviceSplitKernel::path_trace(DeviceTask *task,
+                                   RenderTile& tile,
+                                   device_memory& kgbuffer,
+                                   device_memory& kernel_data)
+{ /* Renders tile.num_samples samples of the tile with the split kernels; false on a device/kernel error, true otherwise (also on cancel). */
+	if(device->have_error()) {
+		return false;
+	}
+
+	/* Get local size */
+	size_t local_size[2];
+	{
+		int2 lsize = split_kernel_local_size();
+		local_size[0] = lsize[0];
+		local_size[1] = lsize[1];
+	}
+
+	/* Set global size */
+	size_t global_size[2];
+	{
+		int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
+
+		/* Make sure that the global work size is a multiple of the
+		 * local work size in each dimension.
+		 */
+		global_size[0] = round_up(gsize[0], local_size[0]);
+		global_size[1] = round_up(gsize[1], local_size[1]);
+	}
+
+	/* Number of elements in the global state buffer */
+	int num_global_elements = global_size[0] * global_size[1];
+	assert(num_global_elements % WORK_POOL_SIZE == 0);
+
+	/* Allocate all required global memory once; the buffers are sized from this first call's global size and reused by later calls. */
+	if(first_tile) {
+		first_tile = false;
+
+		/* Calculate max groups */
+
+		/* Denotes the maximum work groups possible w.r.t. the current global work size. */
+		unsigned int max_work_groups = num_global_elements / WORK_POOL_SIZE + 1;
+
+		/* Allocate work_pool_wgs memory. */
+		work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
+		device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
+
+		queue_index.resize(NUM_QUEUES * sizeof(int));
+		device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
+
+		use_queues_flag.resize(sizeof(char));
+		device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
+
+		ray_state.resize(num_global_elements);
+		device->mem_alloc("ray_state", ray_state, MEM_READ_WRITE);
+
+		split_data.resize(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
+		device->mem_alloc("split_data", split_data, MEM_READ_WRITE);
+	}
+
+#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
+		if(device->have_error()) { \
+			return false; \
+		} \
+		if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
+			return false; \
+		}
+
+	tile.sample = tile.start_sample;
+
+	/* for exponential increase between tile updates */
+	int time_multiplier = 1;
+
+	while(tile.sample < tile.start_sample + tile.num_samples) {
+		/* to keep track of how long it takes to run a number of samples */
+		double start_time = time_dt();
+
+		/* initial guess to start rolling average */
+		const int initial_num_samples = 1;
+		/* approx number of samples that fit in time_multiplier seconds (samples per second while the multiplier is 1) */
+		int samples_per_second = (avg_time_per_sample > 0.0) ?
+		                         int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;
+
+		RenderTile subtile = tile; /* copy of the tile restricted to the sample range of this pass */
+		subtile.start_sample = tile.sample;
+		subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);
+
+		if(device->have_error()) {
+			return false;
+		}
+
+		/* reset state memory here as global size for data_init
+		 * kernel might not be large enough to do in kernel
+		 */
+		device->mem_zero(work_pool_wgs);
+		device->mem_zero(split_data);
+		device->mem_zero(ray_state);
+
+		if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+		                                   subtile,
+		                                   num_global_elements,
+		                                   kgbuffer,
+		                                   kernel_data,
+		                                   split_data,
+		                                   ray_state,
+		                                   queue_index,
+		                                   use_queues_flag,
+		                                   work_pool_wgs))
+		{
+			return false;
+		}
+
+		ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
+
+		bool activeRaysAvailable = true;
+
+		while(activeRaysAvailable) {
+			/* Do path-iteration in host [Enqueue Path-iteration kernels]. */
+			for(int PathIter = 0; PathIter < 16; PathIter++) { /* 16 rounds are enqueued between host-side ray_state checks */
+				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+				ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
+
+				if(task->get_cancel()) {
+					return true;
+				}
+			}
+
+			/* Decide if we should exit path-iteration in host. */
+			device->mem_copy_from(ray_state, 0, global_size[0] * global_size[1] * sizeof(char), 1, 1); /* read ray_state back to the host */
+
+			activeRaysAvailable = false;
+
+			for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
+				int8_t state = ray_state.get_data()[rayStateIter];
+
+				if(state != RAY_INACTIVE) {
+					if(state == RAY_INVALID) {
+						/* Something went wrong, abort to avoid looping endlessly. */
+						device->set_error("Split kernel error: invalid ray state");
+						return false;
+					}
+
+					/* Not all rays are RAY_INACTIVE. */
+					activeRaysAvailable = true;
+					break;
+				}
+			}
+
+			if(task->get_cancel()) {
+				return true;
+			}
+		}
+
+		double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
+
+		if(avg_time_per_sample == 0.0) {
+			/* start rolling average */
+			avg_time_per_sample = time_per_sample;
+		}
+		else {
+			avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample; /* exponential moving average weighted by alpha */
+		}
+
+#undef ENQUEUE_SPLIT_KERNEL /* sits inside the loop body textually; every use of the macro is above this line */
+
+		tile.sample += subtile.num_samples;
+		task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
+
+		time_multiplier = min(time_multiplier << 1, 10); /* double the time budget for the next pass, capped at 10 */
+
+		if(task->get_cancel()) {
+			return true;
+		}
+	}
+
+	return true;
+}
+
+CCL_NAMESPACE_END
+
+
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
new file mode 100644
index 00000000000..15a94953a11
--- /dev/null
+++ b/intern/cycles/device/device_split_kernel.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __DEVICE_SPLIT_KERNEL_H__
+#define __DEVICE_SPLIT_KERNEL_H__
+
+#include "device/device.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* When allocating global memory in chunks, we may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes per chunk,
+ * since some bytes may be needed for aligning the chunks of memory.
+ * This is the amount of memory that we set aside for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+
+/* Types used for split kernel */
+
+class KernelDimensions {
+public:
+	size_t global_size[2]; /* global work size in each of the two dimensions */
+	size_t local_size[2]; /* local work size in each of the two dimensions */
+	/* Copies both two-element arrays, so this object does not refer to the caller's storage afterwards. */
+	KernelDimensions(size_t global_size_[2], size_t local_size_[2])
+	{
+		global_size[0] = global_size_[0]; global_size[1] = global_size_[1];
+		local_size[0] = local_size_[0]; local_size[1] = local_size_[1];
+	}
+};
+
+class SplitKernelFunction { /* Abstract handle to one split kernel entry point (e.g. CUDASplitKernelFunction). */
+public:
+	virtual ~SplitKernelFunction() {} /* virtual: DeviceSplitKernel deletes these through base-class pointers */
+
+	/* Enqueue the kernel over `dim`; kg and data are the buffers path_trace() passes as kgbuffer and kernel_data. Returns false if there is an error. */
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0;
+};
+
+class DeviceSplitKernel { /* Host-side driver for split-kernel path tracing; device backends implement the pure virtuals below. */
+private:
+	Device *device;
+
+	SplitKernelFunction *kernel_path_init; /* the kernel_* pointers are NULL until load_kernels() fills them; owned and deleted by this class */
+	SplitKernelFunction *kernel_scene_intersect;
+	SplitKernelFunction *kernel_lamp_emission;
+	SplitKernelFunction *kernel_do_volume;
+	SplitKernelFunction *kernel_queue_enqueue;
+	SplitKernelFunction *kernel_indirect_background;
+	SplitKernelFunction *kernel_shader_eval;
+	SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
+	SplitKernelFunction *kernel_subsurface_scatter;
+	SplitKernelFunction *kernel_direct_lighting;
+	SplitKernelFunction *kernel_shadow_blocked_ao;
+	SplitKernelFunction *kernel_shadow_blocked_dl;
+	SplitKernelFunction *kernel_next_iteration_setup;
+	SplitKernelFunction *kernel_indirect_subsurface;
+	SplitKernelFunction *kernel_buffer_update;
+
+	/* Global memory variables [porting]. This memory is used for
+	 * co-operation between the different kernels: data written by
+	 * one kernel is made available to another kernel via this
+	 * global memory.
+	 */
+	device_memory split_data;
+	device_vector<uchar> ray_state;
+	device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
+
+	/* Flag to make sceneintersect and lampemission kernel use queues. */
+	device_memory use_queues_flag;
+
+	/* Approximate time it takes to complete one sample */
+	double avg_time_per_sample;
+
+	/* Work pool with respect to each work group. */
+	device_memory work_pool_wgs;
+
+	/* max_closure of the requested features that the kernels were last loaded for (-1 before any load). */
+	int current_max_closure;
+
+	/* Set to true in the constructor; cleared by the first path_trace() call, when the shared buffers get allocated. */
+	bool first_tile;
+
+public:
+	explicit DeviceSplitKernel(Device* device);
+	virtual ~DeviceSplitKernel(); /* frees the shared buffers and deletes the loaded kernel functions */
+
+	bool load_kernels(const DeviceRequestedFeatures& requested_features); /* false as soon as one kernel function fails to load */
+	bool path_trace(DeviceTask *task,
+	                RenderTile& rtile,
+	                device_memory& kgbuffer,
+	                device_memory& kernel_data); /* false on error; true otherwise, including when the task is cancelled */
+
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0; /* size used for split_data when running num_threads threads */
+	size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size); /* max_buffer_size divided by the per-element state size */
+
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data_,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs) = 0; /* called once per sample batch, before path_init; false aborts path_trace() */
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name, const DeviceRequestedFeatures&) = 0; /* NULL makes load_kernels() fail */
+	virtual int2 split_kernel_local_size() = 0;
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0; /* path_trace() rounds the result up to a multiple of the local size */
+};
+
+CCL_NAMESPACE_END
+
+#endif /* __DEVICE_SPLIT_KERNEL_H__ */
+
+
+
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 48d18035c13..ca303365627 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -17,12 +17,12 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "device_task.h"
+#include "device/device_task.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "util_algorithm.h"
-#include "util_time.h"
+#include "util/util_algorithm.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 8bd54c3d2b0..feee89fd6e4 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -17,11 +17,11 @@
 #ifndef __DEVICE_TASK_H__
 #define __DEVICE_TASK_H__
 
-#include "device_memory.h"
+#include "device/device_memory.h"
 
-#include "util_function.h"
-#include "util_list.h"
-#include "util_task.h"
+#include "util/util_function.h"
+#include "util/util_list.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -51,6 +51,8 @@ public:
 	int shader_filter;
 	int shader_x, shader_w;
 
+	int passes_size;
+
 	explicit DeviceTask(Type type = PATH_TRACE);
 
 	int get_subtask_count(int num, int max_size = 0);
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 4023ba89a10..764216d0dfa 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -16,39 +16,39 @@
 
 #ifdef WITH_OPENCL
 
-#include "device.h"
+#include "device/device.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
 
 #include "clew.h"
 
 CCL_NAMESPACE_BEGIN
 
-#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
-
-/* Macro declarations used with split kernel */
-
-/* Macro to enable/disable work-stealing */
-#define __WORK_STEALING__
-
-#define SPLIT_KERNEL_LOCAL_SIZE_X 64
-#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing */
+#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
+/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
+static inline cl_int opencl_finish_after(cl_command_queue queue, cl_int status)
+{ /* `status` is the enqueue result; it has already been evaluated as an argument before this body runs */
+	clFinish(queue);
+	return status; /* so each wrapper below is one expression that yields the real enqueue status */
+}
+
+#  undef clEnqueueNDRangeKernel
+#  define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
+	(clFinish(a), opencl_finish_after(a, CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i)))
+
+#  undef clEnqueueWriteBuffer
+#  define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
+	(clFinish(a), opencl_finish_after(a, CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i)))
+
+#  undef clEnqueueReadBuffer
+#  define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
+	(clFinish(a), opencl_finish_after(a, CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i)))
+#endif  /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
 
-/* This value may be tuned according to the scene we are rendering.
- *
- * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
- * ray-bounces will improve performance.
- */
-#define PATH_ITER_INC_FACTOR 8
-
-/* When allocate global memory in chunks. We may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
- * Since some bytes may be needed for aligning chunks of memory;
- * This is the amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
 
 struct OpenCLPlatformDevice {
 	OpenCLPlatformDevice(cl_platform_id platform_id,
@@ -90,6 +90,54 @@ public:
 	                              cl_device_id device_id);
 	static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
 	                               bool force_all = false);
+	static bool use_single_program();
+
+	/* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
+
+	/* Platform information. */
+	static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
+	static cl_uint get_num_platforms();
+
+	static bool get_platforms(vector<cl_platform_id> *platform_ids,
+	                          cl_int *error = NULL);
+	static vector<cl_platform_id> get_platforms();
+
+	static bool get_platform_name(cl_platform_id platform_id,
+	                              string *platform_name);
+	static string get_platform_name(cl_platform_id platform_id);
+
+	static bool get_num_platform_devices(cl_platform_id platform_id,
+	                                     cl_device_type device_type,
+	                                     cl_uint *num_devices,
+	                                     cl_int *error = NULL);
+	static cl_uint get_num_platform_devices(cl_platform_id platform_id,
+	                                        cl_device_type device_type);
+
+	static bool get_platform_devices(cl_platform_id platform_id,
+	                                 cl_device_type device_type,
+	                                 vector<cl_device_id> *device_ids,
+	                                 cl_int* error = NULL);
+	static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
+	                                                 cl_device_type device_type);
+
+	/* Device information. */
+	static bool get_device_name(cl_device_id device_id,
+	                            string *device_name,
+	                            cl_int* error = NULL);
+
+	static string get_device_name(cl_device_id device_id);
+
+	static bool get_device_type(cl_device_id device_id,
+	                            cl_device_type *device_type,
+	                            cl_int* error = NULL);
+	static cl_device_type get_device_type(cl_device_id device_id);
+
+	/* Get somewhat more readable device name.
+	 * Main difference is AMD OpenCL here which only gives code name
+	 * for the regular device name. This will give more sane device
+	 * name using some extensions.
+	 */
+	static string get_readable_device_name(cl_device_id device_id);
 };
 
 /* Thread safe cache for contexts and programs.
@@ -248,6 +296,7 @@ public:
 
 	bool device_initialized;
 	string platform_name;
+	string device_name;
 
 	bool opencl_error(cl_int err);
 	void opencl_error(const string& message);
@@ -266,10 +315,10 @@ public:
 
 	/* Has to be implemented by the real device classes.
 	 * The base device will then load all these programs. */
-	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
 	                          vector<OpenCLProgram*> &programs) = 0;
 
-	void mem_alloc(device_memory& mem, MemoryType type);
+	void mem_alloc(const char *name, device_memory& mem, MemoryType type);
 	void mem_copy_to(device_memory& mem);
 	void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
 	void mem_zero(device_memory& mem);
@@ -326,16 +375,39 @@ protected:
 
 	class ArgumentWrapper {
 	public:
-		ArgumentWrapper() : size(0), pointer(NULL) {}
-		template <typename T>
+		ArgumentWrapper() : size(0), pointer(NULL)
+		{
+		}
+
+		ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
+		                                           pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
+		ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
+		                                              pointer((void*)(&argument.device_pointer))
+		{
+		}
+
+		template<typename T>
 		ArgumentWrapper(T& argument) : size(sizeof(argument)),
-		                               pointer(&argument) { }
+		                               pointer(&argument)
+		{
+		}
+
 		ArgumentWrapper(int argument) : size(sizeof(int)),
 		                                int_value(argument),
-		                                pointer(&int_value) { }
+		                                pointer(&int_value)
+		{
+		}
+
 		ArgumentWrapper(float argument) : size(sizeof(float)),
 		                                  float_value(argument),
-		                                  pointer(&float_value) { }
+		                                  pointer(&float_value)
+		{
+		}
+
 		size_t size;
 		int int_value;
 		float float_value;
diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp
index a2b900312e7..52d0662a8e3 100644
--- a/intern/cycles/device/opencl/opencl_base.cpp
+++ b/intern/cycles/device/opencl/opencl_base.cpp
@@ -16,15 +16,15 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -82,9 +82,10 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	cpPlatform = platform_device.platform_id;
 	cdDevice = platform_device.device_id;
 	platform_name = platform_device.platform_name;
+	device_name = platform_device.device_name;
 	VLOG(2) << "Creating new Cycles device for OpenCL platform "
 	        << platform_name << ", device "
-	        << platform_device.device_name << ".";
+	        << device_name << ".";
 
 	{
 		/* try to use cached context */
@@ -113,12 +114,16 @@ OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool backgrou
 	}
 
 	cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
-	if(opencl_error(ciErr))
+	if(opencl_error(ciErr)) {
+		opencl_error("OpenCL: Error creating command queue");
 		return;
+	}
 
 	null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
-	if(opencl_error(ciErr))
+	if(opencl_error(ciErr)) {
+		opencl_error("OpenCL: Error creating memory buffer for NULL");
 		return;
+	}
 
 	fprintf(stderr, "Device init success\n");
 	device_initialized = true;
@@ -147,10 +152,8 @@ OpenCLDeviceBase::~OpenCLDeviceBase()
 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
 	const void * /*private_info*/, size_t /*cb*/, void *user_data)
 {
-	char name[256];
-	clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
-
-	fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
+	string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
+	fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
 }
 
 bool OpenCLDeviceBase::opencl_version_check()
@@ -191,6 +194,8 @@ string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
 
 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
 {
+	VLOG(2) << "Loading kernels for platform " << platform_name
+	        << ", device " << device_name << ".";
 	/* Verify if device was initialized. */
 	if(!device_initialized) {
 		fprintf(stderr, "OpenCL: failed to initialize device.\n");
@@ -206,11 +211,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	base_program.add_kernel(ustring("convert_to_half_float"));
 	base_program.add_kernel(ustring("shader"));
 	base_program.add_kernel(ustring("bake"));
+	base_program.add_kernel(ustring("zero_buffer"));
 
 	vector<OpenCLProgram*> programs;
 	programs.push_back(&base_program);
 	/* Call actual class to fill the vector with its programs. */
-	load_kernels(requested_features, programs);
+	if(!load_kernels(requested_features, programs)) {
+		return false;
+	}
 
 	/* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
 	 * serialize the calls internally, so it's not much use right now.
@@ -242,8 +250,14 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
 	return true;
 }
 
-void OpenCLDeviceBase::mem_alloc(device_memory& mem, MemoryType type)
+void OpenCLDeviceBase::mem_alloc(const char *name, device_memory& mem, MemoryType type)
 {
+	if(name) {
+		VLOG(1) << "Buffer allocate: " << name << ", "
+			    << string_human_readable_number(mem.memory_size()) << " bytes. ("
+			    << string_human_readable_size(mem.memory_size()) << ")";
+	}
+
 	size_t size = mem.memory_size();
 
 	cl_mem_flags mem_flag;
@@ -311,8 +325,61 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
 void OpenCLDeviceBase::mem_zero(device_memory& mem)
 {
 	if(mem.device_pointer) {
-		memset((void*)mem.data_pointer, 0, mem.memory_size());
-		mem_copy_to(mem);
+		if(base_program.is_loaded()) {  /* Preferred path: clear the device buffer with the "zero_buffer" kernel. */
+			cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+
+			size_t global_size[] = {1024, 1024};  /* Fixed 2D launch size, reused for every pass. */
+			size_t num_threads = global_size[0] * global_size[1];
+
+			cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
+			cl_ulong d_offset = 0;
+			cl_ulong d_size = 0;
+
+			while(d_offset < mem.memory_size()) {  /* Clear in chunks until the whole allocation is covered. */
+				d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);  /* Bytes for this pass: num_threads float4-sized items, clamped to what remains. */
+
+				kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+
+				ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
+				                               ckZeroBuffer,
+				                               2,
+				                               NULL,
+				                               global_size,
+				                               NULL,
+				                               0,
+				                               NULL,
+				                               NULL);
+				opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");  /* NOTE(review): the loop keeps enqueueing after a reported failure -- confirm this is intended. */
+
+				d_offset += d_size;
+			}
+		}
+
+		if(mem.data_pointer) {  /* Zero the host-side copy as well, when one exists. */
+			memset((void*)mem.data_pointer, 0, mem.memory_size());
+		}
+
+		if(!base_program.is_loaded()) {  /* Fallback while the base program is not loaded: upload zeros from the host. */
+			void* zero = (void*)mem.data_pointer;
+
+			if(!mem.data_pointer) {  /* No host copy: use a temporary zero-filled staging buffer. */
+				zero = util_aligned_malloc(mem.memory_size(), 16);
+				memset(zero, 0, mem.memory_size());
+			}
+
+			opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+			                                   CL_MEM_PTR(mem.device_pointer),
+			                                   CL_TRUE,  /* Blocking write, so the staging buffer can be freed right after. */
+			                                   0,
+			                                   mem.memory_size(),
+			                                   zero,
+			                                   0,
+			                                   NULL, NULL));
+
+			if(!mem.data_pointer) {  /* Only free what was allocated above. */
+				util_aligned_free(zero);
+			}
+		}
 	}
 }
 
@@ -337,7 +404,7 @@ void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
 		device_vector<uchar> *data = new device_vector<uchar>();
 		data->copy((uchar*)host, size);
 
-		mem_alloc(*data, MEM_READ_ONLY);
+		mem_alloc(name, *data, MEM_READ_ONLY);
 		i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
 	}
 	else {
@@ -356,7 +423,7 @@ void OpenCLDeviceBase::tex_alloc(const char *name,
 	VLOG(1) << "Texture allocate: " << name << ", "
 	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
 	        << string_human_readable_size(mem.memory_size()) << ")";
-	mem_alloc(mem, MEM_READ_ONLY);
+	mem_alloc(NULL, mem, MEM_READ_ONLY);
 	mem_copy_to(mem);
 	assert(mem_map.find(name) == mem_map.end());
 	mem_map.insert(MemMap::value_type(name, mem.device_pointer));
@@ -460,7 +527,7 @@ void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_
 
 #define KERNEL_TEX(type, ttype, name) \
 set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
 
 	start_arg_index += kernel_set_args(ckFilmConvertKernel,
@@ -511,7 +578,7 @@ void OpenCLDeviceBase::shader(DeviceTask& task)
 
 #define KERNEL_TEX(type, ttype, name) \
 	set_kernel_arg_mem(kernel, &start_arg_index, #name);
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
 
 	start_arg_index += kernel_set_args(kernel,
diff --git a/intern/cycles/device/opencl/opencl_mega.cpp b/intern/cycles/device/opencl/opencl_mega.cpp
index 6ea7619e022..a2fd1d71156 100644
--- a/intern/cycles/device/opencl/opencl_mega.cpp
+++ b/intern/cycles/device/opencl/opencl_mega.cpp
@@ -16,15 +16,15 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -43,11 +43,12 @@ public:
 		return true;
 	}
 
-	virtual void load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
+	virtual bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/,
 	                          vector<OpenCLProgram*> &programs)
 	{
 		path_trace_program.add_kernel(ustring("path_trace"));
 		programs.push_back(&path_trace_program);
+		return true;  /* This override only registers its program, so it always reports success. */
 	}
 
 	~OpenCLDeviceMegaKernel()
@@ -83,7 +84,7 @@ public:
 
 #define KERNEL_TEX(type, ttype, name) \
 		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
 
 		start_arg_index += kernel_set_args(ckPathTraceKernel,
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 3c3c2150128..579dbc84f53 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -16,1290 +16,359 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "buffers.h"
+#include "render/buffers.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
 
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "device/device_split_kernel.h"
+
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
-/* TODO(sergey): This is to keep tile split on OpenCL level working
- * for now, since without this view-port render does not work as it
- * should.
- *
- * Ideally it'll be done on the higher level, but we need to get ready
- * for merge rather soon, so let's keep split logic private here in
- * the file.
- */
-class SplitRenderTile : public RenderTile {
-public:
-	SplitRenderTile()
-		: RenderTile(),
-		  buffer_offset_x(0),
-		  buffer_offset_y(0),
-		  rng_state_offset_x(0),
-		  rng_state_offset_y(0),
-		  buffer_rng_state_stride(0) {}
-
-	explicit SplitRenderTile(RenderTile& tile)
-		: RenderTile(),
-		  buffer_offset_x(0),
-		  buffer_offset_y(0),
-		  rng_state_offset_x(0),
-		  rng_state_offset_y(0),
-		  buffer_rng_state_stride(0)
-	{
-		x = tile.x;
-		y = tile.y;
-		w = tile.w;
-		h = tile.h;
-		start_sample = tile.start_sample;
-		num_samples = tile.num_samples;
-		sample = tile.sample;
-		resolution = tile.resolution;
-		offset = tile.offset;
-		stride = tile.stride;
-		buffer = tile.buffer;
-		rng_state = tile.rng_state;
-		buffers = tile.buffers;
+class OpenCLSplitKernel;
+
+static string get_build_options(OpenCLDeviceBase *device, const DeviceRequestedFeatures& requested_features)
+{
+	string build_options = "-D__SPLIT_KERNEL__ ";  /* Start from the split-kernel define, then append the feature-specific options. */
+	build_options += requested_features.get_build_options();
+
+	/* Set compute device build option: GPU devices additionally get __COMPUTE_DEVICE_GPU__. */
+	cl_device_type device_type;
+	OpenCLInfo::get_device_type(device->cdDevice, &device_type, &device->ciErr);  /* The query status is stored in the device's ciErr. */
+	assert(device->ciErr == CL_SUCCESS);  /* NOTE(review): debug-only check; device_type has no initializer and may be indeterminate if the query fails -- confirm. */
+	if(device_type == CL_DEVICE_TYPE_GPU) {
+		build_options += " -D__COMPUTE_DEVICE_GPU__";
 	}
 
-	/* Split kernel is device global memory constrained;
-	 * hence split kernel cant render big tile size's in
-	 * one go. If the user sets a big tile size (big tile size
-	 * is a term relative to the available device global memory),
-	 * we split the tile further and then call path_trace on
-	 * each of those split tiles. The following variables declared,
-	 * assist in achieving that purpose
-	 */
-	int buffer_offset_x;
-	int buffer_offset_y;
-	int rng_state_offset_x;
-	int rng_state_offset_y;
-	int buffer_rng_state_stride;
-};
+	return build_options;
+}
 
 /* OpenCLDeviceSplitKernel's declaration/definition. */
 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
 {
 public:
-	/* Kernel declaration. */
+	DeviceSplitKernel *split_kernel;
 	OpenCLProgram program_data_init;
-	OpenCLProgram program_scene_intersect;
-	OpenCLProgram program_lamp_emission;
-	OpenCLProgram program_queue_enqueue;
-	OpenCLProgram program_background_buffer_update;
-	OpenCLProgram program_shader_eval;
-	OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
-	OpenCLProgram program_direct_lighting;
-	OpenCLProgram program_shadow_blocked;
-	OpenCLProgram program_next_iteration_setup;
-	OpenCLProgram program_sum_all_radiance;
-
-	/* Global memory variables [porting]; These memory is used for
-	 * co-operation between different kernels; Data written by one
-	 * kernel will be available to another kernel via this global
-	 * memory.
-	 */
-	cl_mem rng_coop;
-	cl_mem throughput_coop;
-	cl_mem L_transparent_coop;
-	cl_mem PathRadiance_coop;
-	cl_mem Ray_coop;
-	cl_mem PathState_coop;
-	cl_mem Intersection_coop;
-	cl_mem kgbuffer;  /* KernelGlobals buffer. */
-
-	/* Global buffers for ShaderData. */
-	cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
-	cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
-	                        * shadow_blocked kernel.
-	                        */
-
-	/* Global memory required for shadow blocked and accum_radiance. */
-	cl_mem BSDFEval_coop;
-	cl_mem ISLamp_coop;
-	cl_mem LightRay_coop;
-	cl_mem AOAlpha_coop;
-	cl_mem AOBSDF_coop;
-	cl_mem AOLightRay_coop;
-	cl_mem Intersection_coop_shadow;
-
-#ifdef WITH_CYCLES_DEBUG
-	/* DebugData memory */
-	cl_mem debugdata_coop;
-#endif
-
-	/* Global state array that tracks ray state. */
-	cl_mem ray_state;
-
-	/* Per sample buffers. */
-	cl_mem per_sample_output_buffers;
-
-	/* Denotes which sample each ray is being processed for. */
-	cl_mem work_array;
-
-	/* Queue */
-	cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
-	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
-	                     * Tracks the size of each queue.
-	                     */
-
-	/* Flag to make sceneintersect and lampemission kernel use queues. */
-	cl_mem use_queues_flag;
-
-	/* Amount of memory in output buffer associated with one pixel/thread. */
-	size_t per_thread_output_buffer_size;
-
-	/* Total allocatable available device memory. */
-	size_t total_allocatable_memory;
-
-	/* host version of ray_state; Used in checking host path-iteration
-	 * termination.
-	 */
-	char *hostRayStateArray;
-
-	/* Number of path-iterations to be done in one shot. */
-	unsigned int PathIteration_times;
-
-#ifdef __WORK_STEALING__
-	/* Work pool with respect to each work group. */
-	cl_mem work_pool_wgs;
-
-	/* Denotes the maximum work groups possible w.r.t. current tile size. */
-	unsigned int max_work_groups;
-#endif
-
-	/* clos_max value for which the kernels have been loaded currently. */
-	int current_max_closure;
-
-	/* Marked True in constructor and marked false at the end of path_trace(). */
-	bool first_tile;
-
-	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
-	: OpenCLDeviceBase(info, stats, background_)
-	{
-		background = background_;
-
-		/* Initialize cl_mem variables. */
-		kgbuffer = NULL;
-		sd = NULL;
-		sd_DL_shadow = NULL;
-
-		rng_coop = NULL;
-		throughput_coop = NULL;
-		L_transparent_coop = NULL;
-		PathRadiance_coop = NULL;
-		Ray_coop = NULL;
-		PathState_coop = NULL;
-		Intersection_coop = NULL;
-		ray_state = NULL;
-
-		AOAlpha_coop = NULL;
-		AOBSDF_coop = NULL;
-		AOLightRay_coop = NULL;
-		BSDFEval_coop = NULL;
-		ISLamp_coop = NULL;
-		LightRay_coop = NULL;
-		Intersection_coop_shadow = NULL;
-
-#ifdef WITH_CYCLES_DEBUG
-		debugdata_coop = NULL;
-#endif
-
-		work_array = NULL;
-
-		/* Queue. */
-		Queue_data = NULL;
-		Queue_index = NULL;
-		use_queues_flag = NULL;
-
-		per_sample_output_buffers = NULL;
-
-		per_thread_output_buffer_size = 0;
-		hostRayStateArray = NULL;
-		PathIteration_times = PATH_ITER_INC_FACTOR;
-#ifdef __WORK_STEALING__
-		work_pool_wgs = NULL;
-		max_work_groups = 0;
-#endif
-		current_max_closure = -1;
-		first_tile = true;
-
-		/* Get device's maximum memory that can be allocated. */
-		ciErr = clGetDeviceInfo(cdDevice,
-		                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
-		                        sizeof(size_t),
-		                        &total_allocatable_memory,
-		                        NULL);
-		assert(ciErr == CL_SUCCESS);
-		if(platform_name == "AMD Accelerated Parallel Processing") {
-			/* This value is tweak-able; AMD platform does not seem to
-			 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
-			 * is considered for further computation.
-			 */
-			total_allocatable_memory /= 2;
-		}
-	}
+	OpenCLProgram program_state_buffer_size;
 
-	virtual bool show_samples() const {
-		return false;
-	}
+	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_);
 
-	/* Split kernel utility functions. */
-	size_t get_tex_size(const char *tex_name)
+	~OpenCLDeviceSplitKernel()
 	{
-		cl_mem ptr;
-		size_t ret_size = 0;
-		MemMap::iterator i = mem_map.find(tex_name);
-		if(i != mem_map.end()) {
-			ptr = CL_MEM_PTR(i->second);
-			ciErr = clGetMemObjectInfo(ptr,
-			                           CL_MEM_SIZE,
-			                           sizeof(ret_size),
-			                           &ret_size,
-			                           NULL);
-			assert(ciErr == CL_SUCCESS);
-		}
-		return ret_size;
+		task_pool.stop();  /* Stop the task pool before tearing down the programs. */
+
+		/* Release both programs this device creates in load_kernels(). */
+		program_data_init.release();
+		program_state_buffer_size.release();
+		delete split_kernel;
 	}
 
-	size_t get_shader_data_size(size_t max_closure)
+	virtual bool load_kernels(const DeviceRequestedFeatures& requested_features,
+	                          vector<OpenCLDeviceBase::OpenCLProgram*> &programs)
 	{
-		/* ShaderData size with variable size ShaderClosure array */
-		return sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - max_closure));
+		bool single_program = OpenCLInfo::use_single_program();  /* Single-program mode: use "split" / kernel_split.cl instead of per-kernel names and sources. */
+		program_data_init = OpenCLDeviceBase::OpenCLProgram(this,
+		                                  single_program ? "split" : "split_data_init",
+		                                  single_program ? "kernel_split.cl" : "kernel_data_init.cl",
+		                                  get_build_options(this, requested_features));
+
+		program_data_init.add_kernel(ustring("path_trace_data_init"));
+		programs.push_back(&program_data_init);  /* Hand the program to the caller through the output vector. */
+
+		program_state_buffer_size = OpenCLDeviceBase::OpenCLProgram(this,
+		                                  single_program ? "split" : "split_state_buffer_size",
+		                                  single_program ? "kernel_split.cl" : "kernel_state_buffer_size.cl",
+		                                  get_build_options(this, requested_features));
+		program_state_buffer_size.add_kernel(ustring("path_trace_state_buffer_size"));
+		programs.push_back(&program_state_buffer_size);
+
+		return split_kernel->load_kernels(requested_features);  /* Delegate to the DeviceSplitKernel object; its result is this function's result. */
 	}
 
-	/* Returns size of KernelGlobals structure associated with OpenCL. */
-	size_t get_KernelGlobals_size()
+	void thread_run(DeviceTask *task)
 	{
-		/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
-		 * fetch its size.
-		 */
-		typedef struct KernelGlobals {
-			ccl_constant KernelData *data;
+		if(task->type == DeviceTask::FILM_CONVERT) {  /* Dispatch on the task type; other types are ignored. */
+			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+		}
+		else if(task->type == DeviceTask::SHADER) {
+			shader(*task);
+		}
+		else if(task->type == DeviceTask::PATH_TRACE) {
+			RenderTile tile;
+
+			/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
+			 * fetch its size.
+			 */
+			typedef struct KernelGlobals {
+				ccl_constant KernelData *data;
 #define KERNEL_TEX(type, ttype, name) \
-	ccl_global type *name;
-#include "kernel_textures.h"
+				ccl_global type *name;
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
-			void *sd_input;
-			void *isect_shadow;
-		} KernelGlobals;
+				SplitData split_data;
+				SplitParams split_param_data;
+			} KernelGlobals;
+
+			/* Allocate a device buffer sized like the local KernelGlobals struct above. */
+			device_memory kgbuffer;
+			kgbuffer.resize(sizeof(KernelGlobals));
+			mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
+
+			/* Keep rendering tiles until done. */
+			while(task->acquire_tile(this, tile)) {
+				split_kernel->path_trace(task,
+				                         tile,
+				                         kgbuffer,
+				                         *const_mem_map["__data"]);  /* NOTE(review): dereferenced unchecked; assumes "__data" is already in const_mem_map -- confirm. */
+
+				/* Complete kernel execution before releasing the tile. */
+				/* This helps in multi-device render;
+				 * The device that reaches the critical-section function
+				 * release_tile waits (stalling other devices from entering
+				 * release_tile) for all kernels to complete. If device1 (a
+				 * slow-render device) reaches release_tile first then it would
+				 * stall device2 (a fast-render device) from proceeding to render
+				 * next tile.
+				 */
+				clFinish(cqCommandQueue);
 
-		return sizeof(KernelGlobals);
+				task->release_tile(tile);
+			}
+
+			mem_free(kgbuffer);  /* Free the kernel globals buffer once the tile loop ends. */
+		}
+	}
+
+protected:
+	/* ** These are for working around some compiler-specific bugs ** */
+
+	string build_options_for_base_program(
+	        const DeviceRequestedFeatures& requested_features)
+	{
+		return requested_features.get_build_options();  /* Returns the requested-feature options unchanged; nothing is added here. */
 	}
 
-	virtual void load_kernels(const DeviceRequestedFeatures& requested_features,
-	                          vector<OpenCLProgram*> &programs)
+	friend class OpenCLSplitKernel;
+	friend class OpenCLSplitKernelFunction;
+};
+
+class OpenCLSplitKernelFunction : public SplitKernelFunction {  /* SplitKernelFunction implementation that holds its own OpenCLProgram. */
+public:
+	OpenCLDeviceSplitKernel* device;
+	OpenCLDeviceBase::OpenCLProgram program;
+
+	OpenCLSplitKernelFunction(OpenCLDeviceSplitKernel* device) : device(device) {}
+	~OpenCLSplitKernelFunction() { program.release(); }  /* The program is released together with this object. */
+
+	virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
 	{
-		string build_options = "-D__SPLIT_KERNEL__ ";
-#ifdef __WORK_STEALING__
-		build_options += "-D__WORK_STEALING__ ";
-#endif
-		build_options += requested_features.get_build_options();
-
-		/* Set compute device build option. */
-		cl_device_type device_type;
-		ciErr = clGetDeviceInfo(cdDevice,
-		                        CL_DEVICE_TYPE,
-		                        sizeof(cl_device_type),
-		                        &device_type,
-		                        NULL);
-		assert(ciErr == CL_SUCCESS);
-		if(device_type == CL_DEVICE_TYPE_GPU) {
-			build_options += " -D__COMPUTE_DEVICE_GPU__";
+		device->kernel_set_args(program(), 0, kg, data);  /* Starting at argument index 0, pass the kg and data buffers. */
+
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                                       program(),
+		                                       2,
+		                                       NULL,
+		                                       dim.global_size,
+		                                       dim.local_size,
+		                                       0,
+		                                       NULL,
+		                                       NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		if(device->ciErr != CL_SUCCESS) {  /* Record an error message on the device and report failure to the caller. */
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
+			return false;
 		}
 
-#define GLUE(a, b) a ## b
-#define LOAD_KERNEL(name) \
-	do { \
-		GLUE(program_, name) = OpenCLProgram(this, "split_" #name, "kernel_" #name ".cl", build_options); \
-		GLUE(program_, name).add_kernel(ustring("path_trace_" #name)); \
-		programs.push_back(&GLUE(program_, name)); \
-	} while(false)
-
-		LOAD_KERNEL(data_init);
-		LOAD_KERNEL(scene_intersect);
-		LOAD_KERNEL(lamp_emission);
-		LOAD_KERNEL(queue_enqueue);
-		LOAD_KERNEL(background_buffer_update);
-		LOAD_KERNEL(shader_eval);
-		LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
-		LOAD_KERNEL(direct_lighting);
-		LOAD_KERNEL(shadow_blocked);
-		LOAD_KERNEL(next_iteration_setup);
-		LOAD_KERNEL(sum_all_radiance);
-
-#undef FIND_KERNEL
-#undef GLUE
-
-		current_max_closure = requested_features.max_closure;
+		return true;  /* The kernel was enqueued without error. */
 	}
+};
 
-	~OpenCLDeviceSplitKernel()
+class OpenCLSplitKernel : public DeviceSplitKernel {
+	OpenCLDeviceSplitKernel *device;
+public:
+	explicit OpenCLSplitKernel(OpenCLDeviceSplitKernel *device) : DeviceSplitKernel(device), device(device) {
+	}
+
+	virtual SplitKernelFunction* get_split_kernel_function(string kernel_name,
+	                                                       const DeviceRequestedFeatures& requested_features)
 	{
-		task_pool.stop();
+		OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device);  /* Heap-allocated; deleted below only if loading fails, otherwise returned. */
 
-		/* Release kernels */
-		program_data_init.release();
-		program_scene_intersect.release();
-		program_lamp_emission.release();
-		program_queue_enqueue.release();
-		program_background_buffer_update.release();
-		program_shader_eval.release();
-		program_holdout_emission_blurring_pathtermination_ao.release();
-		program_direct_lighting.release();
-		program_shadow_blocked.release();
-		program_next_iteration_setup.release();
-		program_sum_all_radiance.release();
-
-		/* Release global memory */
-		release_mem_object_safe(rng_coop);
-		release_mem_object_safe(throughput_coop);
-		release_mem_object_safe(L_transparent_coop);
-		release_mem_object_safe(PathRadiance_coop);
-		release_mem_object_safe(Ray_coop);
-		release_mem_object_safe(PathState_coop);
-		release_mem_object_safe(Intersection_coop);
-		release_mem_object_safe(kgbuffer);
-		release_mem_object_safe(sd);
-		release_mem_object_safe(sd_DL_shadow);
-		release_mem_object_safe(ray_state);
-		release_mem_object_safe(AOAlpha_coop);
-		release_mem_object_safe(AOBSDF_coop);
-		release_mem_object_safe(AOLightRay_coop);
-		release_mem_object_safe(BSDFEval_coop);
-		release_mem_object_safe(ISLamp_coop);
-		release_mem_object_safe(LightRay_coop);
-		release_mem_object_safe(Intersection_coop_shadow);
-#ifdef WITH_CYCLES_DEBUG
-		release_mem_object_safe(debugdata_coop);
-#endif
-		release_mem_object_safe(use_queues_flag);
-		release_mem_object_safe(Queue_data);
-		release_mem_object_safe(Queue_index);
-		release_mem_object_safe(work_array);
-#ifdef __WORK_STEALING__
-		release_mem_object_safe(work_pool_wgs);
-#endif
-		release_mem_object_safe(per_sample_output_buffers);
-
-		if(hostRayStateArray != NULL) {
-			free(hostRayStateArray);
+		bool single_program = OpenCLInfo::use_single_program();  /* Single-program mode: use "split" / kernel_split.cl instead of names derived from kernel_name. */
+		kernel->program =
+			OpenCLDeviceBase::OpenCLProgram(device,
+			                                single_program ? "split" : "split_" + kernel_name,
+			                                single_program ? "kernel_split.cl" : "kernel_" + kernel_name + ".cl",
+			                                get_build_options(device, requested_features));
+
+		kernel->program.add_kernel(ustring("path_trace_" + kernel_name));  /* The kernel entry point is "path_trace_" + kernel_name. */
+		kernel->program.load();  /* Load right away so a failure can be detected below. */
+
+		if(!kernel->program.is_loaded()) {  /* Loading failed: clean up and signal it with NULL. */
+			delete kernel;
+			return NULL;
 		}
+
+		return kernel;
 	}
 
-	void path_trace(DeviceTask *task,
-	                SplitRenderTile& rtile,
-	                int2 max_render_feasible_tile_size)
+	virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
 	{
-		/* cast arguments to cl types */
-		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
-		cl_int d_x = rtile.x;
-		cl_int d_y = rtile.y;
-		cl_int d_w = rtile.w;
-		cl_int d_h = rtile.h;
-		cl_int d_offset = rtile.offset;
-		cl_int d_stride = rtile.stride;
-
-		/* Make sure that set render feasible tile size is a multiple of local
-		 * work size dimensions.
-		 */
-		assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
-		assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
+		device_vector<uint64_t> size_buffer;  /* Single-element buffer used to read the result back from the device. */
+		size_buffer.resize(1);
+		device->mem_alloc(NULL, size_buffer, MEM_READ_WRITE);
+
+		uint threads = num_threads;  /* NOTE(review): narrows size_t to uint; presumably matches the kernel's argument type -- confirm. */
+		device->kernel_set_args(device->program_state_buffer_size(), 0, kg, data, threads, size_buffer);
+
+		size_t global_size = 64;  /* Fixed 1D launch of 64 work items. */
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                               device->program_state_buffer_size(),
+		                               1,
+		                               NULL,
+		                               &global_size,
+		                               NULL,
+		                               0,
+		                               NULL,
+		                               NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		device->mem_copy_from(size_buffer, 0, 1, 1, sizeof(uint64_t));  /* Copy the single uint64_t back to the host. */
+		device->mem_free(size_buffer);  /* Device allocation is released on both the success and the error path. */
+
+		if(device->ciErr != CL_SUCCESS) {  /* NOTE(review): assumes mem_copy_from/mem_free leave the enqueue status in ciErr untouched -- confirm. */
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
+			return 0;
+		}
+
+		return *size_buffer.get_data();  /* NOTE(review): read after mem_free; relies on the host-side data staying valid -- confirm. */
+	}
 
-		size_t global_size[2];
-		size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
-		                        SPLIT_KERNEL_LOCAL_SIZE_Y};
+	virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
+	                                            RenderTile& rtile,
+	                                            int num_global_elements,
+	                                            device_memory& kernel_globals,
+	                                            device_memory& kernel_data,
+	                                            device_memory& split_data,
+	                                            device_memory& ray_state,
+	                                            device_memory& queue_index,
+	                                            device_memory& use_queues_flag,
+	                                            device_memory& work_pool_wgs
+	                                            )
+	{
+		cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
 
 		/* Set the range of samples to be processed for every ray in
 		 * path-regeneration logic.
 		 */
 		cl_int start_sample = rtile.start_sample;
 		cl_int end_sample = rtile.start_sample + rtile.num_samples;
-		cl_int num_samples = rtile.num_samples;
-
-#ifdef __WORK_STEALING__
-		global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
-		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
-		unsigned int num_parallel_samples = 1;
-#else
-		global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
-		unsigned int num_threads = max_render_feasible_tile_size.x *
-		                           max_render_feasible_tile_size.y;
-		unsigned int num_tile_columns_possible = num_threads / global_size[1];
-		/* Estimate number of parallel samples that can be
-		 * processed in parallel.
-		 */
-		unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
-		                                        rtile.num_samples);
-		/* Wavefront size in AMD is 64.
-		 * TODO(sergey): What about other platforms?
-		 */
-		if(num_parallel_samples >= 64) {
-			/* TODO(sergey): Could use generic round-up here. */
-			num_parallel_samples = (num_parallel_samples / 64) * 64;
-		}
-		assert(num_parallel_samples != 0);
-
-		global_size[0] = d_w * num_parallel_samples;
-#endif  /* __WORK_STEALING__ */
-
-		assert(global_size[0] * global_size[1] <=
-		       max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
-
-		/* Allocate all required global memory once. */
-		if(first_tile) {
-			size_t num_global_elements = max_render_feasible_tile_size.x *
-			                             max_render_feasible_tile_size.y;
-			/* TODO(sergey): This will actually over-allocate if
-			 * particular kernel does not support multiclosure.
-			 */
-			size_t shaderdata_size = get_shader_data_size(current_max_closure);
-
-#ifdef __WORK_STEALING__
-			/* Calculate max groups */
-			size_t max_global_size[2];
-			size_t tile_x = max_render_feasible_tile_size.x;
-			size_t tile_y = max_render_feasible_tile_size.y;
-			max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
-			max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
-			max_work_groups = (max_global_size[0] * max_global_size[1]) /
-			                  (local_size[0] * local_size[1]);
-			/* Allocate work_pool_wgs memory. */
-			work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
-#endif  /* __WORK_STEALING__ */
-
-			/* Allocate queue_index memory only once. */
-			Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
-			use_queues_flag = mem_alloc(sizeof(char));
-			kgbuffer = mem_alloc(get_KernelGlobals_size());
-
-			/* Create global buffers for ShaderData. */
-			sd = mem_alloc(num_global_elements * shaderdata_size);
-			sd_DL_shadow = mem_alloc(num_global_elements * 2 * shaderdata_size);
-
-			/* Creation of global memory buffers which are shared among
-			 * the kernels.
-			 */
-			rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
-			throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
-			L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
-			PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
-			Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
-			Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
-			AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
-			AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
-			ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
-			LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
-			Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));
-
-#ifdef WITH_CYCLES_DEBUG
-			debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
-#endif
-
-			ray_state = mem_alloc(num_global_elements * sizeof(char));
-
-			hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
-			assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
-
-			Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
-			work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
-			per_sample_output_buffers = mem_alloc(num_global_elements *
-			                                      per_thread_output_buffer_size);
-		}
-
-		cl_int dQueue_size = global_size[0] * global_size[1];
 
 		cl_uint start_arg_index =
-			kernel_set_args(program_data_init(),
+			device->kernel_set_args(device->program_data_init(),
 			                0,
-			                kgbuffer,
-			                sd_DL_shadow,
-			                d_data,
-			                per_sample_output_buffers,
-			                d_rng_state,
-			                rng_coop,
-			                throughput_coop,
-			                L_transparent_coop,
-			                PathRadiance_coop,
-			                Ray_coop,
-			                PathState_coop,
-			                Intersection_coop_shadow,
-			                ray_state);
+			                kernel_globals,
+			                kernel_data,
+			                split_data,
+			                num_global_elements,
+			                ray_state,
+			                rtile.rng_state);
 
 /* TODO(sergey): Avoid map lookup here. */
 #define KERNEL_TEX(type, ttype, name) \
-	set_kernel_arg_mem(program_data_init(), &start_arg_index, #name);
-#include "kernel_textures.h"
+	device->set_kernel_arg_mem(device->program_data_init(), &start_arg_index, #name);
+#include "kernel/kernel_textures.h"
 #undef KERNEL_TEX
 
 		start_arg_index +=
-			kernel_set_args(program_data_init(),
+			device->kernel_set_args(device->program_data_init(),
 			                start_arg_index,
 			                start_sample,
-			                d_x,
-			                d_y,
-			                d_w,
-			                d_h,
-			                d_offset,
-			                d_stride,
-			                rtile.rng_state_offset_x,
-			                rtile.rng_state_offset_y,
-			                rtile.buffer_rng_state_stride,
-			                Queue_data,
-			                Queue_index,
+			                end_sample,
+			                rtile.x,
+			                rtile.y,
+			                rtile.w,
+			                rtile.h,
+			                rtile.offset,
+			                rtile.stride,
+			                queue_index,
 			                dQueue_size,
 			                use_queues_flag,
-			                work_array,
-#ifdef __WORK_STEALING__
 			                work_pool_wgs,
-			                num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
-			                debugdata_coop,
-#endif
-			                num_parallel_samples);
-
-		kernel_set_args(program_scene_intersect(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                rng_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                d_w,
-		                d_h,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag,
-#ifdef WITH_CYCLES_DEBUG
-		                debugdata_coop,
-#endif
-		                num_parallel_samples);
-
-		kernel_set_args(program_lamp_emission(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                throughput_coop,
-		                PathRadiance_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                d_w,
-		                d_h,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag,
-		                num_parallel_samples);
-
-		kernel_set_args(program_queue_enqueue(),
-		                0,
-		                Queue_data,
-		                Queue_index,
-		                ray_state,
-		                dQueue_size);
-
-		kernel_set_args(program_background_buffer_update(),
-		                 0,
-		                 kgbuffer,
-		                 d_data,
-		                 per_sample_output_buffers,
-		                 d_rng_state,
-		                 rng_coop,
-		                 throughput_coop,
-		                 PathRadiance_coop,
-		                 Ray_coop,
-		                 PathState_coop,
-		                 L_transparent_coop,
-		                 ray_state,
-		                 d_w,
-		                 d_h,
-		                 d_x,
-		                 d_y,
-		                 d_stride,
-		                 rtile.rng_state_offset_x,
-		                 rtile.rng_state_offset_y,
-		                 rtile.buffer_rng_state_stride,
-		                 work_array,
-		                 Queue_data,
-		                 Queue_index,
-		                 dQueue_size,
-		                 end_sample,
-		                 start_sample,
-#ifdef __WORK_STEALING__
-		                 work_pool_wgs,
-		                 num_samples,
-#endif
-#ifdef WITH_CYCLES_DEBUG
-		                 debugdata_coop,
-#endif
-		                 num_parallel_samples);
-
-		kernel_set_args(program_shader_eval(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(program_holdout_emission_blurring_pathtermination_ao(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                per_sample_output_buffers,
-		                rng_coop,
-		                throughput_coop,
-		                L_transparent_coop,
-		                PathRadiance_coop,
-		                PathState_coop,
-		                Intersection_coop,
-		                AOAlpha_coop,
-		                AOBSDF_coop,
-		                AOLightRay_coop,
-		                d_w,
-		                d_h,
-		                d_x,
-		                d_y,
-		                d_stride,
-		                ray_state,
-		                work_array,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-#ifdef __WORK_STEALING__
-		                start_sample,
-#endif
-		                num_parallel_samples);
-
-		kernel_set_args(program_direct_lighting(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                PathState_coop,
-		                ISLamp_coop,
-		                LightRay_coop,
-		                BSDFEval_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(program_shadow_blocked(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                PathState_coop,
-		                LightRay_coop,
-		                AOLightRay_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size);
-
-		kernel_set_args(program_next_iteration_setup(),
-		                0,
-		                kgbuffer,
-		                d_data,
-		                sd,
-		                rng_coop,
-		                throughput_coop,
-		                PathRadiance_coop,
-		                Ray_coop,
-		                PathState_coop,
-		                LightRay_coop,
-		                ISLamp_coop,
-		                BSDFEval_coop,
-		                AOLightRay_coop,
-		                AOBSDF_coop,
-		                AOAlpha_coop,
-		                ray_state,
-		                Queue_data,
-		                Queue_index,
-		                dQueue_size,
-		                use_queues_flag);
-
-		kernel_set_args(program_sum_all_radiance(),
-		                0,
-		                d_data,
-		                d_buffer,
-		                per_sample_output_buffers,
-		                num_parallel_samples,
-		                d_w,
-		                d_h,
-		                d_stride,
-		                rtile.buffer_offset_x,
-		                rtile.buffer_offset_y,
-		                rtile.buffer_rng_state_stride,
-		                start_sample);
-
-		/* Macro for Enqueuing split kernels. */
-#define GLUE(a, b) a ## b
-#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
-		{ \
-			ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
-			                               GLUE(program_, \
-			                                    kernelName)(), \
-			                               2, \
-			                               NULL, \
-			                               globalSize, \
-			                               localSize, \
-			                               0, \
-			                               NULL, \
-			                               NULL); \
-			opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
-			if(ciErr != CL_SUCCESS) { \
-				string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
-				                               clewErrorString(ciErr)); \
-				opencl_error(message); \
-				return; \
-			} \
-		} (void) 0
+			                rtile.num_samples,
+			                rtile.buffer);
 
 		/* Enqueue ckPathTraceKernel_data_init kernel. */
-		ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
-		bool activeRaysAvailable = true;
-
-		/* Record number of time host intervention has been made */
-		unsigned int numHostIntervention = 0;
-		unsigned int numNextPathIterTimes = PathIteration_times;
-		bool canceled = false;
-		while(activeRaysAvailable) {
-			/* Twice the global work size of other kernels for
-			 * ckPathTraceKernel_shadow_blocked_direct_lighting. */
-			size_t global_size_shadow_blocked[2];
-			global_size_shadow_blocked[0] = global_size[0] * 2;
-			global_size_shadow_blocked[1] = global_size[1];
-
-			/* Do path-iteration in host [Enqueue Path-iteration kernels. */
-			for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
-				ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
-				ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
-				ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
-
-				if(task->get_cancel()) {
-					canceled = true;
-					break;
-				}
-			}
-
-			/* Read ray-state into Host memory to decide if we should exit
-			 * path-iteration in host.
-			 */
-			ciErr = clEnqueueReadBuffer(cqCommandQueue,
-			                            ray_state,
-			                            CL_TRUE,
-			                            0,
-			                            global_size[0] * global_size[1] * sizeof(char),
-			                            hostRayStateArray,
-			                            0,
-			                            NULL,
-			                            NULL);
-			assert(ciErr == CL_SUCCESS);
-
-			activeRaysAvailable = false;
-
-			for(int rayStateIter = 0;
-			    rayStateIter < global_size[0] * global_size[1];
-			    ++rayStateIter)
-			{
-				if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
-					/* Not all rays are RAY_INACTIVE. */
-					activeRaysAvailable = true;
-					break;
-				}
-			}
-
-			if(activeRaysAvailable) {
-				numHostIntervention++;
-				PathIteration_times = PATH_ITER_INC_FACTOR;
-				/* Host intervention done before all rays become RAY_INACTIVE;
-				 * Set do more initial iterations for the next tile.
-				 */
-				numNextPathIterTimes += PATH_ITER_INC_FACTOR;
-			}
-
-			if(task->get_cancel()) {
-				canceled = true;
-				break;
-			}
-		}
-
-		/* Execute SumALLRadiance kernel to accumulate radiance calculated in
-		 * per_sample_output_buffers into RenderTile's output buffer.
-		 */
-		if(!canceled) {
-			size_t sum_all_radiance_local_size[2] = {16, 16};
-			size_t sum_all_radiance_global_size[2];
-			sum_all_radiance_global_size[0] =
-				(((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
-				sum_all_radiance_local_size[0];
-			sum_all_radiance_global_size[1] =
-				(((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
-				sum_all_radiance_local_size[1];
-			ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
-			                     sum_all_radiance_global_size,
-			                     sum_all_radiance_local_size);
-		}
-
-#undef ENQUEUE_SPLIT_KERNEL
-#undef GLUE
-
-		if(numHostIntervention == 0) {
-			/* This means that we are executing kernel more than required
-			 * Must avoid this for the next sample/tile.
-			 */
-			PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
-			PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
-		}
-		else {
-			/* Number of path-iterations done for this tile is set as
-			 * Initial path-iteration times for the next tile
-			 */
-			PathIteration_times = numNextPathIterTimes;
-		}
-
-		first_tile = false;
-	}
-
-	/* Calculates the amount of memory that has to be always
-	 * allocated in order for the split kernel to function.
-	 * This memory is tile/scene-property invariant (meaning,
-	 * the value returned by this function does not depend
-	 * on the user set tile size or scene properties.
-	 */
-	size_t get_invariable_mem_allocated()
-	{
-		size_t total_invariable_mem_allocated = 0;
-		size_t KernelGlobals_size = 0;
-
-		KernelGlobals_size = get_KernelGlobals_size();
-
-		total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
-		total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
-		total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
-
-		return total_invariable_mem_allocated;
-	}
-
-	/* Calculate the memory that has-to-be/has-been allocated for
-	 * the split kernel to function.
-	 */
-	size_t get_tile_specific_mem_allocated(const int2 tile_size)
-	{
-		size_t tile_specific_mem_allocated = 0;
-
-		/* Get required tile info */
-		unsigned int user_set_tile_w = tile_size.x;
-		unsigned int user_set_tile_h = tile_size.y;
-
-#ifdef __WORK_STEALING__
-		/* Calculate memory to be allocated for work_pools in
-		 * case of work_stealing.
-		 */
-		size_t max_global_size[2];
-		size_t max_num_work_pools = 0;
-		max_global_size[0] =
-			(((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		max_global_size[1] =
-			(((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		max_num_work_pools =
-			(max_global_size[0] * max_global_size[1]) /
-			(SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
-		tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
-#endif
-
-		tile_specific_mem_allocated +=
-			user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
-		tile_specific_mem_allocated +=
-			user_set_tile_w * user_set_tile_h * sizeof(RNG);
-
-		return tile_specific_mem_allocated;
-	}
-
-	/* Calculates the texture memories and KernelData (d_data) memory
-	 * that has been allocated.
-	 */
-	size_t get_scene_specific_mem_allocated(cl_mem d_data)
-	{
-		size_t scene_specific_mem_allocated = 0;
-		/* Calculate texture memories. */
-#define KERNEL_TEX(type, ttype, name) \
-	scene_specific_mem_allocated += get_tex_size(#name);
-#include "kernel_textures.h"
-#undef KERNEL_TEX
-		size_t d_data_size;
-		ciErr = clGetMemObjectInfo(d_data,
-		                           CL_MEM_SIZE,
-		                           sizeof(d_data_size),
-		                           &d_data_size,
-		                           NULL);
-		assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
-		scene_specific_mem_allocated += d_data_size;
-		return scene_specific_mem_allocated;
-	}
-
-	/* Calculate the memory required for one thread in split kernel. */
-	size_t get_per_thread_memory()
-	{
-		size_t shaderdata_size = 0;
-		/* TODO(sergey): This will actually over-allocate if
-		 * particular kernel does not support multiclosure.
-		 */
-		shaderdata_size = get_shader_data_size(current_max_closure);
-		size_t retval = sizeof(RNG)
-			+ sizeof(float3)          /* Throughput size */
-			+ sizeof(float)           /* L transparent size */
-			+ sizeof(char)            /* Ray state size */
-			+ sizeof(unsigned int)    /* Work element size */
-			+ sizeof(int)             /* ISLamp_size */
-			+ sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
-			+ sizeof(Intersection)    /* Overall isect */
-			+ sizeof(Intersection)    /* Instersection_coop_AO */
-			+ sizeof(Intersection)    /* Intersection coop DL */
-			+ shaderdata_size         /* Overall ShaderData */
-			+ (shaderdata_size * 2)   /* ShaderData : DL and shadow */
-			+ sizeof(Ray) + sizeof(BsdfEval)
-			+ sizeof(float3)          /* AOAlpha size */
-			+ sizeof(float3)          /* AOBSDF size */
-			+ sizeof(Ray)
-			+ (sizeof(int) * NUM_QUEUES)
-			+ per_thread_output_buffer_size;
-		return retval;
-	}
-
-	/* Considers the total memory available in the device and
-	 * and returns the maximum global work size possible.
-	 */
-	size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
-	{
-		/* Calculate invariably allocated memory. */
-		size_t invariable_mem_allocated = get_invariable_mem_allocated();
-		/* Calculate tile specific allocated memory. */
-		size_t tile_specific_mem_allocated =
-			get_tile_specific_mem_allocated(tile_size);
-		/* Calculate scene specific allocated memory. */
-		size_t scene_specific_mem_allocated =
-			get_scene_specific_mem_allocated(d_data);
-		/* Calculate total memory available for the threads in global work size. */
-		size_t available_memory = total_allocatable_memory
-			- invariable_mem_allocated
-			- tile_specific_mem_allocated
-			- scene_specific_mem_allocated
-			- DATA_ALLOCATION_MEM_FACTOR;
-		size_t per_thread_memory_required = get_per_thread_memory();
-		return (available_memory / per_thread_memory_required);
-	}
-
-	/* Checks if the device has enough memory to render the whole tile;
-	 * If not, we should split single tile into multiple tiles of small size
-	 * and process them all.
-	 */
-	bool need_to_split_tile(unsigned int d_w,
-	                        unsigned int d_h,
-	                        int2 max_render_feasible_tile_size)
-	{
-		size_t global_size_estimate[2];
-		/* TODO(sergey): Such round-ups are in quite few places, need to replace
-		 * them with an utility macro.
-		 */
-		global_size_estimate[0] =
-			(((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		global_size_estimate[1] =
-			(((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		if((global_size_estimate[0] * global_size_estimate[1]) >
-		   (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
-		{
-			return true;
-		}
-		else {
+		device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+		                               device->program_data_init(),
+		                               2,
+		                               NULL,
+		                               dim.global_size,
+		                               dim.local_size,
+		                               0,
+		                               NULL,
+		                               NULL);
+
+		device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+		if(device->ciErr != CL_SUCCESS) {
+			string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+			                               clewErrorString(device->ciErr));
+			device->opencl_error(message);
 			return false;
 		}
-	}
 
-	/* Considers the scene properties, global memory available in the device
-	 * and returns a rectanglular tile dimension (approx the maximum)
-	 * that should render on split kernel.
-	 */
-	int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
-	{
-		int2 max_render_feasible_tile_size;
-		int square_root_val = (int)sqrt(feasible_global_work_size);
-		max_render_feasible_tile_size.x = square_root_val;
-		max_render_feasible_tile_size.y = square_root_val;
-		/* Ciel round-off max_render_feasible_tile_size. */
-		int2 ceil_render_feasible_tile_size;
-		ceil_render_feasible_tile_size.x =
-			(((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		ceil_render_feasible_tile_size.y =
-			(((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <=
-		   feasible_global_work_size)
-		{
-			return ceil_render_feasible_tile_size;
-		}
-		/* Floor round-off max_render_feasible_tile_size. */
-		int2 floor_render_feasible_tile_size;
-		floor_render_feasible_tile_size.x =
-			(max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		floor_render_feasible_tile_size.y =
-			(max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		return floor_render_feasible_tile_size;
+		return true;
 	}
 
-	/* Try splitting the current tile into multiple smaller
-	 * almost-square-tiles.
-	 */
-	int2 get_split_tile_size(RenderTile rtile,
-	                         int2 max_render_feasible_tile_size)
+	virtual int2 split_kernel_local_size()
 	{
-		int2 split_tile_size;
-		int num_global_threads = max_render_feasible_tile_size.x *
-		                         max_render_feasible_tile_size.y;
-		int d_w = rtile.w;
-		int d_h = rtile.h;
-		/* Ceil round off d_w and d_h */
-		d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_X;
-		d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-			SPLIT_KERNEL_LOCAL_SIZE_Y;
-		while(d_w * d_h > num_global_threads) {
-			/* Halve the longer dimension. */
-			if(d_w >= d_h) {
-				d_w = d_w / 2;
-				d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-					SPLIT_KERNEL_LOCAL_SIZE_X;
-			}
-			else {
-				d_h = d_h / 2;
-				d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-					SPLIT_KERNEL_LOCAL_SIZE_Y;
-			}
-		}
-		split_tile_size.x = d_w;
-		split_tile_size.y = d_h;
-		return split_tile_size;
+		return make_int2(64, 1);
 	}
 
-	/* Splits existing tile into multiple tiles of tile size split_tile_size. */
-	vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
+	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask */*task*/)
 	{
-		vector<SplitRenderTile> to_path_trace_rtile;
-		int d_w = rtile.w;
-		int d_h = rtile.h;
-		int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
-		int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
-		/* Buffer and rng_state offset calc. */
-		size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
-		size_t offset_x = offset_index % rtile.stride;
-		size_t offset_y = offset_index / rtile.stride;
-		/* Resize to_path_trace_rtile. */
-		to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
-		for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
-			for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
-				int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
-				to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
-				to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
-				to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
-				to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
-				to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
-				to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
-				to_path_trace_rtile[rtile_index].sample = rtile.sample;
-				to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
-				to_path_trace_rtile[rtile_index].offset = rtile.offset;
-				to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
-				to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
-				to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
-				to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
-				to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
-				to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
-				/* Fill width and height of the new render tile. */
-				to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
-					(d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
-					: split_tile_size.x;
-				to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
-					(d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
-					: split_tile_size.y;
-				to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
-			}
+		cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
+		/* Use small global size on CPU devices as it seems to be much faster. */
+		if(type == CL_DEVICE_TYPE_CPU) {
+			VLOG(1) << "Global size: (64, 64).";
+			return make_int2(64, 64);
 		}
-		return to_path_trace_rtile;
-	}
 
-	void thread_run(DeviceTask *task)
-	{
-		if(task->type == DeviceTask::FILM_CONVERT) {
-			film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
-		}
-		else if(task->type == DeviceTask::SHADER) {
-			shader(*task);
-		}
-		else if(task->type == DeviceTask::PATH_TRACE) {
-			RenderTile tile;
-			bool initialize_data_and_check_render_feasibility = false;
-			bool need_to_split_tiles_further = false;
-			int2 max_render_feasible_tile_size;
-			size_t feasible_global_work_size;
-			const int2 tile_size = task->requested_tile_size;
-			/* Keep rendering tiles until done. */
-			while(task->acquire_tile(this, tile)) {
-				if(!initialize_data_and_check_render_feasibility) {
-					/* Initialize data. */
-					/* Calculate per_thread_output_buffer_size. */
-					size_t output_buffer_size = 0;
-					ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
-					                           CL_MEM_SIZE,
-					                           sizeof(output_buffer_size),
-					                           &output_buffer_size,
-					                           NULL);
-					assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
-					/* This value is different when running on AMD and NV. */
-					if(background) {
-						/* In offline render the number of buffer elements
-						 * associated with tile.buffer is the current tile size.
-						 */
-						per_thread_output_buffer_size =
-							output_buffer_size / (tile.w * tile.h);
-					}
-					else {
-						/* interactive rendering, unlike offline render, the number of buffer elements
-						 * associated with tile.buffer is the entire viewport size.
-						 */
-						per_thread_output_buffer_size =
-							output_buffer_size / (tile.buffers->params.width *
-							                      tile.buffers->params.height);
-					}
-					/* Check render feasibility. */
-					feasible_global_work_size = get_feasible_global_work_size(
-						tile_size,
-						CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
-					max_render_feasible_tile_size =
-						get_max_render_feasible_tile_size(
-							feasible_global_work_size);
-					need_to_split_tiles_further =
-						need_to_split_tile(tile_size.x,
-						                   tile_size.y,
-						                   max_render_feasible_tile_size);
-					initialize_data_and_check_render_feasibility = true;
-				}
-				if(need_to_split_tiles_further) {
-					int2 split_tile_size =
-						get_split_tile_size(tile,
-						                    max_render_feasible_tile_size);
-					vector<SplitRenderTile> to_path_trace_render_tiles =
-						split_tiles(tile, split_tile_size);
-					/* Print message to console */
-					if(background && (to_path_trace_render_tiles.size() > 1)) {
-						fprintf(stderr, "Message : Tiles need to be split "
-						        "further inside path trace (due to insufficient "
-						        "device-global-memory for split kernel to "
-						        "function) \n"
-						        "The current tile of dimensions %dx%d is split "
-						        "into tiles of dimension %dx%d for render \n",
-						        tile.w, tile.h,
-						        split_tile_size.x,
-						        split_tile_size.y);
-					}
-					/* Process all split tiles. */
-					for(int tile_iter = 0;
-					    tile_iter < to_path_trace_render_tiles.size();
-					    ++tile_iter)
-					{
-						path_trace(task,
-						           to_path_trace_render_tiles[tile_iter],
-						           max_render_feasible_tile_size);
-					}
-				}
-				else {
-					/* No splitting required; process the entire tile at once. */
-					/* Render feasible tile size is user-set-tile-size itself. */
-					max_render_feasible_tile_size.x =
-						(((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
-						SPLIT_KERNEL_LOCAL_SIZE_X;
-					max_render_feasible_tile_size.y =
-						(((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
-						SPLIT_KERNEL_LOCAL_SIZE_Y;
-					/* buffer_rng_state_stride is stride itself. */
-					SplitRenderTile split_tile(tile);
-					split_tile.buffer_rng_state_stride = tile.stride;
-					path_trace(task, split_tile, max_render_feasible_tile_size);
-				}
-				tile.sample = tile.start_sample + tile.num_samples;
+		cl_ulong max_buffer_size;
+		clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
+		VLOG(1) << "Maximum device allocation size: "
+		        << string_human_readable_number(max_buffer_size) << " bytes. ("
+		        << string_human_readable_size(max_buffer_size) << ").";
 
-				/* Complete kernel execution before release tile. */
-				/* This helps in multi-device render;
-				 * The device that reaches the critical-section function
-				 * release_tile waits (stalling other devices from entering
-				 * release_tile) for all kernels to complete. If device1 (a
-				 * slow-render device) reaches release_tile first then it would
-				 * stall device2 (a fast-render device) from proceeding to render
-				 * next tile.
-				 */
-				clFinish(cqCommandQueue);
-
-				task->release_tile(tile);
-			}
-		}
-	}
-
-protected:
-	cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
-	{
-		cl_mem ptr;
-		assert(bufsize != 0);
-		ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
-		opencl_assert_err(ciErr, "clCreateBuffer");
-		return ptr;
+		size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
+		int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64), (int)sqrt(num_elements));
+		VLOG(1) << "Global size: " << global_size << ".";
+		return global_size;
 	}
+};
 
-	/* ** Those guys are for workign around some compiler-specific bugs ** */
+OpenCLDeviceSplitKernel::OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
+: OpenCLDeviceBase(info, stats, background_)
+{
+	split_kernel = new OpenCLSplitKernel(this);
 
-	string build_options_for_base_program(
-	        const DeviceRequestedFeatures& requested_features)
-	{
-		return requested_features.get_build_options();
-	}
-};
+	background = background_;
+}
 
 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, bool background)
 {
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 82e1640e508..8128fcee09b 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -16,11 +16,12 @@
 
 #ifdef WITH_OPENCL
 
-#include "opencl.h"
+#include "device/opencl/opencl.h"
 
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_time.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_time.h"
 
 using std::cerr;
 using std::endl;
@@ -234,7 +235,7 @@ string OpenCLCache::get_kernel_md5()
 	thread_scoped_lock lock(self.kernel_md5_lock);
 
 	if(self.kernel_md5.empty()) {
-		self.kernel_md5 = path_files_md5_hash(path_get("kernel"));
+		self.kernel_md5 = path_files_md5_hash(path_get("source"));
 	}
 	return self.kernel_md5;
 }
@@ -309,6 +310,8 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
 	string build_options;
 	build_options = device->kernel_build_options(debug_src) + kernel_build_options;
 
+	VLOG(1) << "Build options passed to clBuildProgram: '"
+	        << build_options << "'.";
 	cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
 
 	/* show warnings even if build is successful */
@@ -336,12 +339,13 @@ bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
 
 bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 {
-	string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n";
+	string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
 	/* We compile kernels consisting of many files. unfortunately OpenCL
 	 * kernel caches do not seem to recognize changes in included files.
 	 * so we force recompile on changes by adding the md5 hash of all files.
 	 */
-	source = path_source_replace_includes(source, path_get("kernel"));
+	source = path_source_replace_includes(source, path_get("source"));
+	source += "\n// " + util_md5_string(source) + "\n";
 
 	if(debug_src) {
 		path_write_text(*debug_src, source);
@@ -352,10 +356,10 @@ bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
 	cl_int ciErr;
 
 	program = clCreateProgramWithSource(device->cxContext,
-	                                   1,
-	                                   &source_str,
-	                                   &source_len,
-	                                   &ciErr);
+	                                    1,
+	                                    &source_str,
+	                                    &source_len,
+	                                    &ciErr);
 
 	if(ciErr != CL_SUCCESS) {
 		add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
@@ -438,7 +442,11 @@ void OpenCLDeviceBase::OpenCLProgram::load()
 	if(!program) {
 		add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
 
-		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5();
+		/* need to create source to get md5 */
+		string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
+		source = path_source_replace_includes(source, path_get("source"));
+
+		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
 		basename = path_cache_get(path_join("kernels", basename));
 		string clbin = basename + ".clbin";
 
@@ -544,6 +552,11 @@ bool OpenCLInfo::use_debug()
 	return DebugFlags().opencl.debug;
 }
 
+bool OpenCLInfo::use_single_program()
+{
+	return DebugFlags().opencl.single_program;
+}
+
 bool OpenCLInfo::kernel_use_advanced_shading(const string& platform)
 {
 	/* keep this in sync with kernel_types.h! */
@@ -587,11 +600,20 @@ bool OpenCLInfo::device_supported(const string& platform_name,
                                   const cl_device_id device_id)
 {
 	cl_device_type device_type;
-	clGetDeviceInfo(device_id,
-	                CL_DEVICE_TYPE,
-	                sizeof(cl_device_type),
-	                &device_type,
-	                NULL);
+	if(!get_device_type(device_id, &device_type)) {
+		return false;
+	}
+	string device_name;
+	if(!get_device_name(device_id, &device_name)) {
+		return false;
+	}
+	/* It is possible to have Iris GPU on AMD/Apple OpenCL framework
+	 * (aka, it will not be on Intel framework). This isn't supported
+	 * and needs an explicit blacklist.
+	 */
+	if(strstr(device_name.c_str(), "Iris")) {
+		return false;
+	}
 	if(platform_name == "AMD Accelerated Parallel Processing" &&
 	   device_type == CL_DEVICE_TYPE_GPU)
 	{
@@ -705,39 +727,30 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 		return;
 	}
 
+	cl_int error;
 	vector<cl_device_id> device_ids;
-	cl_uint num_devices = 0;
 	vector<cl_platform_id> platform_ids;
-	cl_uint num_platforms = 0;
 
-	/* Get devices. */
-	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
-	   num_platforms == 0)
-	{
-		FIRST_VLOG(2) << "No OpenCL platforms were found.";
+	/* Get platforms. */
+	if(!get_platforms(&platform_ids, &error)) {
+		FIRST_VLOG(2) << "Error fetching platforms:"
+		              << string(clewErrorString(error));
 		first_time = false;
 		return;
 	}
-	platform_ids.resize(num_platforms);
-	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
-		FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
+	if(platform_ids.size() == 0) {
+		FIRST_VLOG(2) << "No OpenCL platforms were found.";
 		first_time = false;
 		return;
 	}
 	/* Devices are numbered consecutively across platforms. */
-	for(int platform = 0; platform < num_platforms; platform++) {
+	for(int platform = 0; platform < platform_ids.size(); platform++) {
 		cl_platform_id platform_id = platform_ids[platform];
-		char pname[256];
-		if(clGetPlatformInfo(platform_id,
-		                     CL_PLATFORM_NAME,
-		                     sizeof(pname),
-		                     &pname,
-		                     NULL) != CL_SUCCESS)
-		{
+		string platform_name;
+		if(!get_platform_name(platform_id, &platform_name)) {
 			FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
 			continue;
 		}
-		string platform_name = pname;
 		FIRST_VLOG(2) << "Enumerating devices for platform "
 		              << platform_name << ".";
 		if(!platform_version_check(platform_id)) {
@@ -745,39 +758,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 			              << " due to too old compiler version.";
 			continue;
 		}
-		num_devices = 0;
-		cl_int ciErr;
-		if((ciErr = clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  0,
-		                  NULL,
-		                  &num_devices)) != CL_SUCCESS || num_devices == 0)
+		if(!get_platform_devices(platform_id,
+		                         device_type,
+		                         &device_ids,
+		                         &error))
 		{
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch number of devices: " << string(clewErrorString(ciErr));
+			              << ", failed to fetch of devices: "
+			              << string(clewErrorString(error));
 			continue;
 		}
-		device_ids.resize(num_devices);
-		if(clGetDeviceIDs(platform_id,
-		                  device_type,
-		                  num_devices,
-		                  &device_ids[0],
-		                  NULL) != CL_SUCCESS)
-		{
+		if(device_ids.size() == 0) {
 			FIRST_VLOG(2) << "Ignoring platform " << platform_name
-			              << ", failed to fetch devices list.";
+			              << ", it has no devices.";
 			continue;
 		}
-		for(int num = 0; num < num_devices; num++) {
-			cl_device_id device_id = device_ids[num];
-			char device_name[1024] = "\0";
-			if(clGetDeviceInfo(device_id,
-			                   CL_DEVICE_NAME,
-			                   sizeof(device_name),
-			                   &device_name,
-			                   NULL) != CL_SUCCESS)
-			{
-				FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
+		for(int num = 0; num < device_ids.size(); num++) {
+			const cl_device_id device_id = device_ids[num];
+			string device_name;
+			if(!get_device_name(device_id, &device_name, &error)) {
+				FIRST_VLOG(2) << "Failed to fetch device name: "
+				              << string(clewErrorString(error))
+				              << ", ignoring.";
 				continue;
 			}
 			if(!device_version_check(device_id)) {
@@ -789,24 +791,28 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 			   device_supported(platform_name, device_id))
 			{
 				cl_device_type device_type;
-				if(clGetDeviceInfo(device_id,
-				                   CL_DEVICE_TYPE,
-				                   sizeof(cl_device_type),
-				                   &device_type,
-				                   NULL) != CL_SUCCESS)
-				{
+				if(!get_device_type(device_id, &device_type, &error)) {
 					FIRST_VLOG(2) << "Ignoring device " << device_name
-					              << ", failed to fetch device type.";
+					              << ", failed to fetch device type:"
+					              << string(clewErrorString(error));
 					continue;
 				}
-				FIRST_VLOG(2) << "Adding new device " << device_name << ".";
+				string readable_device_name =
+				        get_readable_device_name(device_id);
+				if(readable_device_name != device_name) {
+					FIRST_VLOG(2) << "Using more readable device name: "
+					              << readable_device_name;
+				}
+				FIRST_VLOG(2) << "Adding new device "
+				              << readable_device_name << ".";
 				string hardware_id = get_hardware_id(platform_name, device_id);
-				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
-				                                               platform_name,
-				                                               device_id,
-				                                               device_type,
-				                                               device_name,
-				                                               hardware_id));
+				usable_devices->push_back(OpenCLPlatformDevice(
+				        platform_id,
+				        platform_name,
+				        device_id,
+				        device_type,
+				        readable_device_name,
+				        hardware_id));
 			}
 			else {
 				FIRST_VLOG(2) << "Ignoring device " << device_name
@@ -817,6 +823,252 @@ void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices
 	first_time = false;
 }
 
+/* Fill platform_ids with the IDs of all OpenCL platforms on the system.
+ * Returns false when an OpenCL call fails; the status code is stored in
+ * error when it is non-NULL. Zero platforms is reported as success.
+ */
+bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids,
+                               cl_int *error)
+{
+	/* Reset from possible previous state. */
+	platform_ids->resize(0);
+	cl_uint num_platforms;
+	if(!get_num_platforms(&num_platforms, error)) {
+		return false;
+	}
+	/* Nothing to fetch; at(0) would throw on an empty vector. */
+	if(num_platforms == 0) {
+		return true;
+	}
+	/* Get actual platforms. */
+	platform_ids->resize(num_platforms);
+	cl_int err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL);
+	if(error != NULL) {
+		*error = err;
+	}
+	return err == CL_SUCCESS;
+}
+
+vector<cl_platform_id> OpenCLInfo::get_platforms()
+{
+	vector<cl_platform_id> platform_ids;
+	get_platforms(&platform_ids);
+	return platform_ids;
+}
+
+bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
+{
+	cl_int err;
+	if((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) {
+		if(error != NULL) {
+			*error = err;
+		}
+		*num_platforms = 0;
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	return true;
+}
+
+cl_uint OpenCLInfo::get_num_platforms()
+{
+	cl_uint num_platforms;
+	if(!get_num_platforms(&num_platforms)) {
+		return 0;
+	}
+	return num_platforms;
+}
+
+bool OpenCLInfo::get_platform_name(cl_platform_id platform_id,
+                                   string *platform_name)
+{
+	char buffer[256];
+	if(clGetPlatformInfo(platform_id,
+	                     CL_PLATFORM_NAME,
+	                     sizeof(buffer),
+	                     &buffer,
+	                     NULL) != CL_SUCCESS)
+	{
+		*platform_name = "";
+		return false;
+	}
+	*platform_name = buffer;
+	return true;
+}
+
+string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
+{
+	string platform_name;
+	if (!get_platform_name(platform_id, &platform_name)) {
+		return "";
+	}
+	return platform_name;
+}
+
+bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
+                                          cl_device_type device_type,
+                                          cl_uint *num_devices,
+                                          cl_int *error)
+{
+	cl_int err;
+	if((err = clGetDeviceIDs(platform_id,
+	                         device_type,
+	                         0,
+	                         NULL,
+	                         num_devices)) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = err;
+		}
+		*num_devices = 0;
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	return true;
+}
+
+cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
+                                             cl_device_type device_type)
+{
+	cl_uint num_devices;
+	if(!get_num_platform_devices(platform_id,
+	                             device_type,
+	                             &num_devices))
+	{
+		return 0;
+	}
+	return num_devices;
+}
+
+/* Fill device_ids with the devices of device_type on platform_id. Returns
+ * false when an OpenCL call fails, storing the status in error if non-NULL.
+ */
+bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
+                                      cl_device_type device_type,
+                                      vector<cl_device_id> *device_ids,
+                                      cl_int* error)
+{
+	/* Reset from possible previous state. */
+	device_ids->resize(0);
+	/* Get number of devices to pre-allocate memory. */
+	cl_uint num_devices;
+	if(!get_num_platform_devices(platform_id,
+	                             device_type,
+	                             &num_devices,
+	                             error))
+	{
+		return false;
+	}
+	/* Nothing to fetch; at(0) would throw on an empty vector. */
+	if(num_devices == 0) {
+		return true;
+	}
+	/* Get actual device list. */
+	device_ids->resize(num_devices);
+	const cl_int err = clGetDeviceIDs(platform_id,
+	                                  device_type,
+	                                  num_devices,
+	                                  &device_ids->at(0),
+	                                  NULL);
+	if(error != NULL) {
+		*error = err;
+	}
+	return err == CL_SUCCESS;
+}
+
+vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
+                                                      cl_device_type device_type)
+{
+	vector<cl_device_id> devices;
+	get_platform_devices(platform_id, device_type, &devices);
+	return devices;
+}
+
+bool OpenCLInfo::get_device_name(cl_device_id device_id,
+                                 string *device_name,
+                                 cl_int* error)
+{
+	char buffer[1024];
+	cl_int err;
+	if((err = clGetDeviceInfo(device_id,
+	                          CL_DEVICE_NAME,
+	                          sizeof(buffer),
+	                          &buffer,
+	                          NULL)) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = err;
+		}
+		*device_name = "";
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	*device_name = buffer;
+	return true;
+}
+
+string OpenCLInfo::get_device_name(cl_device_id device_id)
+{
+	string device_name;
+	if(!get_device_name(device_id, &device_name)) {
+		return "";
+	}
+	return device_name;
+}
+
+bool OpenCLInfo::get_device_type(cl_device_id device_id,
+                                 cl_device_type *device_type,
+                                 cl_int* error)
+{
+	cl_int err;
+	if((err = clGetDeviceInfo(device_id,
+	                          CL_DEVICE_TYPE,
+	                          sizeof(cl_device_type),
+	                          device_type,
+	                          NULL)) != CL_SUCCESS)
+	{
+		if(error != NULL) {
+			*error = err;
+		}
+		*device_type = 0;
+		return false;
+	}
+	if(error != NULL) {
+		*error = CL_SUCCESS;
+	}
+	return true;
+}
+
+cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
+{
+	cl_device_type device_type;
+	if(!get_device_type(device_id, &device_type)) {
+		return 0;
+	}
+	return device_type;
+}
+
+string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
+{
+	char board_name[1024];
+	if(clGetDeviceInfo(device_id,
+	                   CL_DEVICE_BOARD_NAME_AMD,
+	                   sizeof(board_name),
+	                   &board_name,
+	                   NULL) == CL_SUCCESS)
+	{
+		return board_name;
+	}
+	/* Fallback to standard device name API. */
+	return get_device_name(device_id);
+}
+
 CCL_NAMESPACE_END
 
 #endif
diff --git a/intern/cycles/graph/CMakeLists.txt b/intern/cycles/graph/CMakeLists.txt
index 4ea18728f1c..e70a18137bd 100644
--- a/intern/cycles/graph/CMakeLists.txt
+++ b/intern/cycles/graph/CMakeLists.txt
@@ -1,7 +1,6 @@
 
 set(INC
-	.
-	../util
+	..
 )
 
 set(SRC
diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp
index 3c228a716d5..10d91a1e4ef 100644
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "node.h"
-#include "node_type.h"
+#include "graph/node.h"
+#include "graph/node_type.h"
 
-#include "util_foreach.h"
-#include "util_param.h"
-#include "util_transform.h"
+#include "util/util_foreach.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h
index 64410f4539b..53425f5faf1 100644
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include "node_type.h"
+#include "graph/node_type.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_enum.h b/intern/cycles/graph/node_enum.h
index 2bae531c036..4e40c294f4f 100644
--- a/intern/cycles/graph/node_enum.h
+++ b/intern/cycles/graph/node_enum.h
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include "util_map.h"
-#include "util_param.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_type.cpp b/intern/cycles/graph/node_type.cpp
index 5b98de778ad..a3a8fa5f382 100644
--- a/intern/cycles/graph/node_type.cpp
+++ b/intern/cycles/graph/node_type.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include "node_type.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "graph/node_type.h"
+#include "util/util_foreach.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_type.h b/intern/cycles/graph/node_type.h
index 1fb135f6d22..7d46e31ce24 100644
--- a/intern/cycles/graph/node_type.h
+++ b/intern/cycles/graph/node_type.h
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include "node_enum.h"
+#include "graph/node_enum.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_xml.cpp b/intern/cycles/graph/node_xml.cpp
index 590e09645ed..aad2740ffc0 100644
--- a/intern/cycles/graph/node_xml.cpp
+++ b/intern/cycles/graph/node_xml.cpp
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "node_xml.h"
+#include "graph/node_xml.h"
 
-#include "util_foreach.h"
-#include "util_string.h"
-#include "util_transform.h"
+#include "util/util_foreach.h"
+#include "util/util_string.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/graph/node_xml.h b/intern/cycles/graph/node_xml.h
index 7494c5e6e55..63e80bf79f2 100644
--- a/intern/cycles/graph/node_xml.h
+++ b/intern/cycles/graph/node_xml.h
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_map.h"
-#include "util_string.h"
-#include "util_xml.h"
+#include "util/util_map.h"
+#include "util/util_string.h"
+#include "util/util_xml.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 5f3ceb0f864..c3772dfa2d8 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -1,10 +1,7 @@
 remove_extra_strict_flags()
 
 set(INC
-	.
-	../util
-	osl
-	svm
+	..
 )
 
 set(INC_SYS
@@ -13,19 +10,28 @@ set(INC_SYS
 
 set(SRC
 	kernels/cpu/kernel.cpp
+	kernels/cpu/kernel_split.cpp
 	kernels/opencl/kernel.cl
+	kernels/opencl/kernel_state_buffer_size.cl
+	kernels/opencl/kernel_split.cl
 	kernels/opencl/kernel_data_init.cl
+	kernels/opencl/kernel_path_init.cl
 	kernels/opencl/kernel_queue_enqueue.cl
 	kernels/opencl/kernel_scene_intersect.cl
 	kernels/opencl/kernel_lamp_emission.cl
-	kernels/opencl/kernel_background_buffer_update.cl
+	kernels/opencl/kernel_do_volume.cl
+	kernels/opencl/kernel_indirect_background.cl
 	kernels/opencl/kernel_shader_eval.cl
 	kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+	kernels/opencl/kernel_subsurface_scatter.cl
 	kernels/opencl/kernel_direct_lighting.cl
-	kernels/opencl/kernel_shadow_blocked.cl
+	kernels/opencl/kernel_shadow_blocked_ao.cl
+	kernels/opencl/kernel_shadow_blocked_dl.cl
 	kernels/opencl/kernel_next_iteration_setup.cl
-	kernels/opencl/kernel_sum_all_radiance.cl
+	kernels/opencl/kernel_indirect_subsurface.cl
+	kernels/opencl/kernel_buffer_update.cl
 	kernels/cuda/kernel.cu
+	kernels/cuda/kernel_split.cu
 )
 
 set(SRC_BVH_HEADERS
@@ -68,6 +74,7 @@ set(SRC_HEADERS
 	kernel_path_common.h
 	kernel_path_state.h
 	kernel_path_surface.h
+	kernel_path_subsurface.h
 	kernel_path_volume.h
 	kernel_projection.h
 	kernel_queues.h
@@ -88,6 +95,10 @@ set(SRC_KERNELS_CPU_HEADERS
 	kernels/cpu/kernel_cpu_image.h
 )
 
+set(SRC_KERNELS_CUDA_HEADERS
+	kernels/cuda/kernel_config.h
+)
+
 set(SRC_CLOSURE_HEADERS
 	closure/alloc.h
 	closure/bsdf.h
@@ -184,6 +195,7 @@ set(SRC_UTIL_HEADERS
 	../util/util_hash.h
 	../util/util_math.h
 	../util/util_math_fast.h
+	../util/util_math_intersect.h
 	../util/util_static_assert.h
 	../util/util_transform.h
 	../util/util_texture.h
@@ -191,17 +203,25 @@ set(SRC_UTIL_HEADERS
 )
 
 set(SRC_SPLIT_HEADERS
-	split/kernel_background_buffer_update.h
+	split/kernel_buffer_update.h
 	split/kernel_data_init.h
 	split/kernel_direct_lighting.h
+	split/kernel_do_volume.h
 	split/kernel_holdout_emission_blurring_pathtermination_ao.h
+	split/kernel_indirect_background.h
+	split/kernel_indirect_subsurface.h
 	split/kernel_lamp_emission.h
 	split/kernel_next_iteration_setup.h
+	split/kernel_path_init.h
+	split/kernel_queue_enqueue.h
 	split/kernel_scene_intersect.h
 	split/kernel_shader_eval.h
-	split/kernel_shadow_blocked.h
+	split/kernel_shadow_blocked_ao.h
+	split/kernel_shadow_blocked_dl.h
 	split/kernel_split_common.h
-	split/kernel_sum_all_radiance.h
+	split/kernel_split_data.h
+	split/kernel_split_data_types.h
+	split/kernel_subsurface_scatter.h
 )
 
 # CUDA module
@@ -229,8 +249,9 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	endif()
 
 	# build for each arch
-	set(cuda_sources kernels/cuda/kernel.cu
+	set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
 		${SRC_HEADERS}
+		${SRC_KERNELS_CUDA_HEADERS}
 		${SRC_BVH_HEADERS}
 		${SRC_SVM_HEADERS}
 		${SRC_GEOM_HEADERS}
@@ -239,15 +260,22 @@ if(WITH_CYCLES_CUDA_BINARIES)
 	)
 	set(cuda_cubins)
 
-	macro(CYCLES_CUDA_KERNEL_ADD arch experimental)
-		if(${experimental})
-			set(cuda_extra_flags "-D__KERNEL_EXPERIMENTAL__")
-			set(cuda_cubin kernel_experimental_${arch}.cubin)
+	macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
+		if(${split})
+			set(cuda_extra_flags "-D__SPLIT__")
+			set(cuda_cubin kernel_split)
 		else()
 			set(cuda_extra_flags "")
-			set(cuda_cubin kernel_${arch}.cubin)
+			set(cuda_cubin kernel)
 		endif()
 
+		if(${experimental})
+			set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
+			set(cuda_cubin ${cuda_cubin}_experimental)
+		endif()
+
+		set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
+
 		if(WITH_CYCLES_DEBUG)
 			set(cuda_debug_flags "-D__KERNEL_DEBUG__")
 		else()
@@ -260,13 +288,19 @@ if(WITH_CYCLES_CUDA_BINARIES)
 		set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
 		set(cuda_math_flags "--use_fast_math")
 
+		if(${split})  # macro arguments are not variables; expand explicitly
+			set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
+		else()
+			set(cuda_kernel_src "/kernels/cuda/kernel.cu")
+		endif()
+
 		add_custom_command(
 			OUTPUT ${cuda_cubin}
 			COMMAND ${cuda_nvcc_command}
 					-arch=${arch}
 					${CUDA_NVCC_FLAGS}
 					-m${CUDA_BITS}
-					--cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda/kernel.cu
+					--cubin ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
 					-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}
 					--ptxas-options="-v"
 					${cuda_arch_flags}
@@ -274,8 +308,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
 					${cuda_math_flags}
 					${cuda_extra_flags}
 					${cuda_debug_flags}
-					-I${CMAKE_CURRENT_SOURCE_DIR}/../util
-					-I${CMAKE_CURRENT_SOURCE_DIR}/svm
+					-I${CMAKE_CURRENT_SOURCE_DIR}/..
 					-DCCL_NAMESPACE_BEGIN=
 					-DCCL_NAMESPACE_END=
 					-DNVCC
@@ -293,7 +326,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
 
 	foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
 		# Compile regular kernel
-		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE)
+		CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
+
+		if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
+			# Compile split kernel
+			CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
+		endif()
 	endforeach()
 
 	add_custom_target(cycles_kernel_cuda ALL DEPENDS ${cuda_cubins})
@@ -311,36 +349,50 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
+set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+
 if(CXX_HAS_SSE)
 	list(APPEND SRC
 		kernels/cpu/kernel_sse2.cpp
 		kernels/cpu/kernel_sse3.cpp
 		kernels/cpu/kernel_sse41.cpp
+		kernels/cpu/kernel_split_sse2.cpp
+		kernels/cpu/kernel_split_sse3.cpp
+		kernels/cpu/kernel_split_sse41.cpp
 	)
 
 	set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 	set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX)
 	list(APPEND SRC
 		kernels/cpu/kernel_avx.cpp
+		kernels/cpu/kernel_split_avx.cpp
 	)
 	set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
 endif()
 
 if(CXX_HAS_AVX2)
 	list(APPEND SRC
 		kernels/cpu/kernel_avx2.cpp
+		kernels/cpu/kernel_split_avx2.cpp
 	)
 	set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+	set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
 endif()
 
 add_library(cycles_kernel
 	${SRC}
 	${SRC_HEADERS}
 	${SRC_KERNELS_CPU_HEADERS}
+	${SRC_KERNELS_CUDA_HEADERS}
 	${SRC_BVH_HEADERS}
 	${SRC_CLOSURE_HEADERS}
 	${SRC_SVM_HEADERS}
@@ -362,24 +414,33 @@ endif()
 #add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
 
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/bvh)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/svm)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/geom)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/split)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_state_buffer_size.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_split.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_path_init.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_do_volume.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_background.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_subsurface_scatter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_ao.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked_dl.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split)
 
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 36798982653..85741016b25 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -27,43 +27,43 @@
 
 CCL_NAMESPACE_BEGIN
 
-#include "bvh_types.h"
+#include "kernel/bvh/bvh_types.h"
 
 /* Common QBVH functions. */
 #ifdef __QBVH__
-#  include "qbvh_nodes.h"
+#  include "kernel/bvh/qbvh_nodes.h"
 #endif
 
 /* Regular BVH traversal */
 
-#include "bvh_nodes.h"
+#include "kernel/bvh/bvh_nodes.h"
 
 #define BVH_FUNCTION_NAME bvh_intersect
 #define BVH_FUNCTION_FEATURES 0
-#include "bvh_traversal.h"
+#include "kernel/bvh/bvh_traversal.h"
 
 #if defined(__INSTANCING__)
 #  define BVH_FUNCTION_NAME bvh_intersect_instancing
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__HAIR__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 #if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #  define BVH_FUNCTION_NAME bvh_intersect_hair_motion
 #  define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_HAIR_MINIMUM_WIDTH|BVH_MOTION
-#  include "bvh_traversal.h"
+#  include "kernel/bvh/bvh_traversal.h"
 #endif
 
 /* Subsurface scattering BVH traversal */
@@ -71,12 +71,12 @@ CCL_NAMESPACE_BEGIN
 #if defined(__SUBSURFACE__)
 #  define BVH_FUNCTION_NAME bvh_intersect_subsurface
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_subsurface.h"
+#  include "kernel/bvh/bvh_subsurface.h"
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_subsurface_motion
 #    define BVH_FUNCTION_FEATURES BVH_MOTION|BVH_HAIR
-#    include "bvh_subsurface.h"
+#    include "kernel/bvh/bvh_subsurface.h"
 #  endif
 #endif  /* __SUBSURFACE__ */
 
@@ -85,18 +85,18 @@ CCL_NAMESPACE_BEGIN
 #if defined(__VOLUME__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume.h"
+#  include "kernel/bvh/bvh_volume.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume.h"
+#    include "kernel/bvh/bvh_volume.h"
 #  endif
 #endif  /* __VOLUME__ */
 
@@ -105,30 +105,30 @@ CCL_NAMESPACE_BEGIN
 #if defined(__SHADOW_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #  define BVH_FUNCTION_FEATURES 0
-#  include "bvh_shadow_all.h"
+#  include "kernel/bvh/bvh_shadow_all.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__HAIR__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 
 #  if defined(__HAIR__) && defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all_hair_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR|BVH_MOTION
-#    include "bvh_shadow_all.h"
+#    include "kernel/bvh/bvh_shadow_all.h"
 #  endif
 #endif  /* __SHADOW_RECORD_ALL__ */
 
@@ -137,18 +137,18 @@ CCL_NAMESPACE_BEGIN
 #if defined(__VOLUME_RECORD_ALL__)
 #  define BVH_FUNCTION_NAME bvh_intersect_volume_all
 #  define BVH_FUNCTION_FEATURES BVH_HAIR
-#  include "bvh_volume_all.h"
+#  include "kernel/bvh/bvh_volume_all.h"
 
 #  if defined(__INSTANCING__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_instancing
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif
 
 #  if defined(__OBJECT_MOTION__)
 #    define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
 #    define BVH_FUNCTION_FEATURES BVH_INSTANCING|BVH_MOTION|BVH_HAIR
-#    include "bvh_volume_all.h"
+#    include "kernel/bvh/bvh_volume_all.h"
 #  endif
 #endif  /* __VOLUME_RECORD_ALL__ */
 
@@ -202,8 +202,9 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
 }
 
 #ifdef __SUBSURFACE__
+/* Note: ray is passed by value to work around a possible CUDA compiler bug. */
 ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
-                                                     const Ray *ray,
+                                                     const Ray ray,
                                                      SubsurfaceIntersection *ss_isect,
                                                      int subsurface_object,
                                                      uint *lcg_state,
@@ -212,7 +213,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 		return bvh_intersect_subsurface_motion(kg,
-		                                       ray,
+		                                       &ray,
 		                                       ss_isect,
 		                                       subsurface_object,
 		                                       lcg_state,
@@ -220,7 +221,7 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 	}
 #endif /* __OBJECT_MOTION__ */
 	return bvh_intersect_subsurface(kg,
-	                                ray,
+	                                &ray,
 	                                ss_isect,
 	                                subsurface_object,
 	                                lcg_state,
@@ -229,30 +230,63 @@ ccl_device_intersect void scene_intersect_subsurface(KernelGlobals *kg,
 #endif
 
 #ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg, const Ray *ray, Intersection *isect, uint max_hits, uint *num_hits)
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
+                                                     const Ray *ray,
+                                                     Intersection *isect,
+                                                     int skip_object,
+                                                     uint max_hits,
+                                                     uint *num_hits)
 {
 #  ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
 #    ifdef __HAIR__
-		if(kernel_data.bvh.have_curves)
-			return bvh_intersect_shadow_all_hair_motion(kg, ray, isect, max_hits, num_hits);
+		if(kernel_data.bvh.have_curves) {
+			return bvh_intersect_shadow_all_hair_motion(kg,
+			                                            ray,
+			                                            isect,
+			                                            skip_object,
+			                                            max_hits,
+			                                            num_hits);
+		}
 #    endif /* __HAIR__ */
 
-		return bvh_intersect_shadow_all_motion(kg, ray, isect, max_hits, num_hits);
+		return bvh_intersect_shadow_all_motion(kg,
+		                                       ray,
+		                                       isect,
+		                                       skip_object,
+		                                       max_hits,
+		                                       num_hits);
 	}
 #  endif /* __OBJECT_MOTION__ */
 
 #  ifdef __HAIR__
-	if(kernel_data.bvh.have_curves)
-		return bvh_intersect_shadow_all_hair(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_curves) {
+		return bvh_intersect_shadow_all_hair(kg,
+		                                     ray,
+		                                     isect,
+		                                     skip_object,
+		                                     max_hits,
+		                                     num_hits);
+	}
 #  endif /* __HAIR__ */
 
 #  ifdef __INSTANCING__
-	if(kernel_data.bvh.have_instancing)
-		return bvh_intersect_shadow_all_instancing(kg, ray, isect, max_hits, num_hits);
+	if(kernel_data.bvh.have_instancing) {
+		return bvh_intersect_shadow_all_instancing(kg,
+		                                           ray,
+		                                           isect,
+		                                           skip_object,
+		                                           max_hits,
+		                                           num_hits);
+	}
 #  endif /* __INSTANCING__ */
 
-	return bvh_intersect_shadow_all(kg, ray, isect, max_hits, num_hits);
+	return bvh_intersect_shadow_all(kg,
+	                                ray,
+	                                isect,
+	                                skip_object,
+	                                max_hits,
+	                                num_hits);
 }
 #endif  /* __SHADOW_RECORD_ALL__ */
 
@@ -357,7 +391,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
-#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
+#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
 /* ToDo: Move to another file? */
 ccl_device int intersections_compare(const void *a, const void *b)
 {
@@ -373,5 +407,36 @@ ccl_device int intersections_compare(const void *a, const void *b)
 }
 #endif
 
-CCL_NAMESPACE_END
+#if defined(__SHADOW_RECORD_ALL__)
+/* Sort the first num_hits entries of hits in place.
+ *
+ * GPU builds bubble sort by ascending t field; other builds defer to
+ * qsort() with intersections_compare. Arrays of 0 or 1 entries are left as is.
+ */
+ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
+{
+#ifdef __KERNEL_GPU__
+	/* Use bubble sort which has more friendly memory pattern on GPU. */
+	bool swapped;
+	do {
+		swapped = false;
+		/* num_hits is unsigned: test j + 1 against it rather than computing
+		 * num_hits - 1, which wraps around when num_hits is 0. */
+		for(uint j = 0; j + 1 < num_hits; ++j) {
+			if(hits[j].t > hits[j + 1].t) {
+				struct Intersection tmp = hits[j];
+				hits[j] = hits[j + 1];
+				hits[j + 1] = tmp;
+				swapped = true;
+			}
+		}
+		/* The largest t of this pass is now last; shrink the range. */
+		--num_hits;
+	} while(swapped);
+#else
+	qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+#endif
+}
+#endif  /* __SHADOW_RECORD_ALL__ */
 
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 726bef1794c..74a9ebf14e4 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -17,8 +17,8 @@
 // TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
 // 3-vector which might be faster.
 ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
-                                                           int node_addr,
-                                                           int child)
+                                                                int node_addr,
+                                                                int child)
 {
 	Transform space;
 	const int child_addr = node_addr + child * 3;
@@ -31,12 +31,12 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
 
 #if !defined(__KERNEL_SSE2__)
 ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
-                                                 const float3 P,
-                                                 const float3 idir,
-                                                 const float t,
-                                                 const int node_addr,
-                                                 const uint visibility,
-                                                 float dist[2])
+                                                      const float3 P,
+                                                      const float3 idir,
+                                                      const float t,
+                                                      const int node_addr,
+                                                      const uint visibility,
+                                                      float dist[2])
 {
 
 	/* fetch node data */
@@ -78,14 +78,14 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_aligned_node_intersect_robust(KernelGlobals *kg,
-                                                        const float3 P,
-                                                        const float3 idir,
-                                                        const float t,
-                                                        const float difl,
-                                                        const float extmax,
-                                                        const int node_addr,
-                                                        const uint visibility,
-                                                        float dist[2])
+                                                             const float3 P,
+                                                             const float3 idir,
+                                                             const float t,
+                                                             const float difl,
+                                                             const float extmax,
+                                                             const int node_addr,
+                                                             const uint visibility,
+                                                             float dist[2])
 {
 
 	/* fetch node data */
@@ -203,13 +203,13 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child_robust(
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const float3 idir,
-                                                   const float t,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const float3 idir,
+                                                        const float t,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -233,15 +233,15 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const float3 idir,
-                                                          const float t,
-                                                          const float difl,
-                                                          const float extmax,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const float3 idir,
+                                                               const float t,
+                                                               const float difl,
+                                                               const float extmax,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	int mask = 0;
 	float4 cnodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
@@ -265,13 +265,13 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3 P,
-                                         const float3 dir,
-                                         const float3 idir,
-                                         const float t,
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3 P,
+                                              const float3 dir,
+                                              const float3 idir,
+                                              const float t,
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -296,15 +296,15 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3 P,
-                                                const float3 dir,
-                                                const float3 idir,
-                                                const float t,
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3 P,
+                                                     const float3 dir,
+                                                     const float3 idir,
+                                                     const float t,
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -442,19 +442,19 @@ ccl_device_forceinline int bvh_aligned_node_intersect_robust(
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
-                                                   const float3 P,
-                                                   const float3 dir,
-                                                   const ssef& isect_near,
-                                                   const ssef& isect_far,
-                                                   const int node_addr,
-                                                   const uint visibility,
-                                                   float dist[2])
+                                                        const float3 P,
+                                                        const float3 dir,
+                                                        const ssef& isect_near,
+                                                        const ssef& isect_far,
+                                                        const int node_addr,
+                                                        const uint visibility,
+                                                        float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
 
 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -503,20 +503,20 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg,
-                                                          const float3 P,
-                                                          const float3 dir,
-                                                          const ssef& isect_near,
-                                                          const ssef& isect_far,
-                                                          const float difl,
-                                                          const int node_addr,
-                                                          const uint visibility,
-                                                          float dist[2])
+                                                               const float3 P,
+                                                               const float3 dir,
+                                                               const ssef& isect_near,
+                                                               const ssef& isect_far,
+                                                               const float difl,
+                                                               const int node_addr,
+                                                               const uint visibility,
+                                                               float dist[2])
 {
 	Transform space0 = bvh_unaligned_node_fetch_space(kg, node_addr, 0);
 	Transform space1 = bvh_unaligned_node_fetch_space(kg, node_addr, 1);
 
 	float3 aligned_dir0 = transform_direction(&space0, dir),
-	       aligned_dir1 = transform_direction(&space1, dir);;
+	       aligned_dir1 = transform_direction(&space1, dir);
 	float3 aligned_P0 = transform_point(&space0, P),
 	       aligned_P1 = transform_point(&space1, P);
 	float3 nrdir0 = -bvh_inverse_direction(aligned_dir0),
@@ -574,17 +574,17 @@ ccl_device_forceinline int bvh_unaligned_node_intersect_robust(KernelGlobals *kg
 }
 
 ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
-                                         const float3& P,
-                                         const float3& dir,
-                                         const ssef& isect_near,
-                                         const ssef& isect_far,
-                                         const ssef& tsplat,
-                                         const ssef Psplat[3],
-                                         const ssef idirsplat[3],
-                                         const shuffle_swap_t shufflexyz[3],
-                                         const int node_addr,
-                                         const uint visibility,
-                                         float dist[2])
+                                              const float3& P,
+                                              const float3& dir,
+                                              const ssef& isect_near,
+                                              const ssef& isect_far,
+                                              const ssef& tsplat,
+                                              const ssef Psplat[3],
+                                              const ssef idirsplat[3],
+                                              const shuffle_swap_t shufflexyz[3],
+                                              const int node_addr,
+                                              const uint visibility,
+                                              float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
@@ -612,19 +612,19 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
 }
 
 ccl_device_forceinline int bvh_node_intersect_robust(KernelGlobals *kg,
-                                                const float3& P,
-                                                const float3& dir,
-                                                const ssef& isect_near,
-                                                const ssef& isect_far,
-                                                const ssef& tsplat,
-                                                const ssef Psplat[3],
-                                                const ssef idirsplat[3],
-                                                const shuffle_swap_t shufflexyz[3],
-                                                const float difl,
-                                                const float extmax,
-                                                const int node_addr,
-                                                const uint visibility,
-                                                float dist[2])
+                                                     const float3& P,
+                                                     const float3& dir,
+                                                     const ssef& isect_near,
+                                                     const ssef& isect_far,
+                                                     const ssef& tsplat,
+                                                     const ssef Psplat[3],
+                                                     const ssef idirsplat[3],
+                                                     const shuffle_swap_t shufflexyz[3],
+                                                     const float difl,
+                                                     const float extmax,
+                                                     const int node_addr,
+                                                     const uint visibility,
+                                                     float dist[2])
 {
 	float4 node = kernel_tex_fetch(__bvh_nodes, node_addr);
 	if(__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index df33a86bb18..267e098f912 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_shadow_all.h"
+#  include "kernel/bvh/qbvh_shadow_all.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -45,6 +45,7 @@ ccl_device_inline
 bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
                                  const Ray *ray,
                                  Intersection *isect_array,
+                                 const int skip_object,
                                  const uint max_hits,
                                  uint *num_hits)
 {
@@ -100,9 +101,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif  /* __KERNEL_SSE2__ */
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -189,6 +187,16 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					while(prim_addr < prim_addr2) {
 						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
 
+#ifdef __SHADOW_TRICKS__
+						uint tri_object = (object == OBJECT_NONE)
+						        ? kernel_tex_fetch(__prim_object, prim_addr)
+						        : object;
+						if(tri_object == skip_object) {
+							++prim_addr;
+							continue;
+						}
+#endif
+
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -198,9 +206,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
+								                         dir,
 								                         PATH_RAY_SHADOW,
 								                         object,
 								                         prim_addr);
@@ -309,12 +317,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
 					num_hits_in_instance = 0;
 					isect_array->t = isect_t;
 
@@ -354,22 +361,17 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
 
-				triangle_intersect_precalc(dir, &isect_precalc);
-
 				/* scale isect->t to adjust for instancing */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
-
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
 			isect_t = tmax;
@@ -400,6 +402,7 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
                                          const Ray *ray,
                                          Intersection *isect_array,
+                                         const int skip_object,
                                          const uint max_hits,
                                          uint *num_hits)
 {
@@ -408,6 +411,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(QBVH)(kg,
 		                                    ray,
 		                                    isect_array,
+		                                    skip_object,
 		                                    max_hits,
 		                                    num_hits);
 	}
@@ -418,6 +422,7 @@ ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
 		return BVH_FUNCTION_FULL_NAME(BVH)(kg,
 		                                   ray,
 		                                   isect_array,
+		                                   skip_object,
 		                                   max_hits,
 		                                   num_hits);
 	}
diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h
index 889bbca21e2..bda7e34907a 100644
--- a/intern/cycles/kernel/bvh/bvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/bvh_subsurface.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_subsurface.h"
+#  include "kernel/bvh/qbvh_subsurface.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -75,16 +75,16 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
+		isect_t = bvh_instance_motion_push(kg,
+		                                   subsurface_object,
+		                                   ray,
+		                                   &P,
+		                                   &dir,
+		                                   &idir,
+		                                   isect_t,
+		                                   &ob_itfm);
 #else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+		isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t);
 #endif
 		object = subsurface_object;
 	}
@@ -109,9 +109,6 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -197,9 +194,9 @@ void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
 							                              ss_isect,
 							                              P,
+							                              dir,
 							                              object,
 							                              prim_addr,
 							                              isect_t,
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 80c8f31473a..c58d3b0316c 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_traversal.h"
+#  include "kernel/bvh/qbvh_traversal.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -104,9 +104,6 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -238,9 +235,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
-								                      &isect_precalc,
 								                      isect,
 								                      P,
+								                      dir,
 								                      visibility,
 								                      object,
 								                      prim_addr))
@@ -354,11 +351,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+					isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+					isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
-					triangle_intersect_precalc(dir, &isect_precalc);
 
 #  if defined(__KERNEL_SSE2__)
 					Psplat[0] = ssef(P.x);
@@ -391,11 +387,10 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
-			triangle_intersect_precalc(dir, &isect_precalc);
 
 #  if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 57e5b8d736d..764aaee44a1 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_volume.h"
+#  include "kernel/bvh/qbvh_volume.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -97,9 +97,6 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -194,9 +191,9 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									continue;
 								}
 								triangle_intersect(kg,
-								                   &isect_precalc,
 								                   isect,
 								                   P,
+								                   dir,
 								                   visibility,
 								                   object,
 								                   prim_addr);
@@ -238,13 +235,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
-
 #  if defined(__KERNEL_SSE2__)
 						Psplat[0] = ssef(P.x);
 						Psplat[1] = ssef(P.y);
@@ -281,13 +276,11 @@ bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 
 			/* instance pop */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 #  if defined(__KERNEL_SSE2__)
 			Psplat[0] = ssef(P.x);
 			Psplat[1] = ssef(P.y);
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index 5a1accebaa0..04ec334e54d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -18,7 +18,7 @@
  */
 
 #ifdef __QBVH__
-#  include "qbvh_volume_all.h"
+#  include "kernel/bvh/qbvh_volume_all.h"
 #endif
 
 #if BVH_FEATURE(BVH_HAIR)
@@ -101,9 +101,6 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 	gen_idirsplat_swap(pn, shuf_identity, shuf_swap, idir, idirsplat, shufflexyz);
 #endif  /* __KERNEL_SSE2__ */
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* traversal loop */
 	do {
 		do {
@@ -199,9 +196,9 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 									continue;
 								}
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
+								                         dir,
 								                         visibility,
 								                         object,
 								                         prim_addr);
@@ -288,14 +285,12 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
-
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
 						num_hits_in_instance = 0;
 						isect_array->t = isect_t;
 
@@ -341,20 +336,17 @@ uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
 #  else
 				bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 				/* Scale isect->t to adjust for instancing. */
 				for(int i = 0; i < num_hits_in_instance; i++) {
 					(isect_array-i-1)->t *= t_fac;
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
-				triangle_intersect_precalc(dir, &isect_precalc);
 			}
 
 			isect_t = tmax;
diff --git a/intern/cycles/kernel/bvh/qbvh_shadow_all.h b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
index 607295f9ed5..ce474438f2c 100644
--- a/intern/cycles/kernel/bvh/qbvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_shadow_all.h
@@ -33,6 +33,7 @@
 ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
                                              const Ray *ray,
                                              Intersection *isect_array,
+                                             const int skip_object,
                                              const uint max_hits,
                                              uint *num_hits)
 {
@@ -96,15 +97,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+				(void)inodes;
 
 				if(false
 #ifdef __VISIBILITY_FLAG__
@@ -270,6 +269,16 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					while(prim_addr < prim_addr2) {
 						kernel_assert((kernel_tex_fetch(__prim_type, prim_addr) & PRIMITIVE_ALL) == p_type);
 
+#ifdef __SHADOW_TRICKS__
+						uint tri_object = (object == OBJECT_NONE)
+						        ? kernel_tex_fetch(__prim_object, prim_addr)
+						        : object;
+						if(tri_object == skip_object) {
+							++prim_addr;
+							continue;
+						}
+#endif
+
 						bool hit;
 
 						/* todo: specialized intersect functions which don't fill in
@@ -279,9 +288,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						switch(p_type) {
 							case PRIMITIVE_TRIANGLE: {
 								hit = triangle_intersect(kg,
-								                         &isect_precalc,
 								                         isect_array,
 								                         P,
+								                         dir,
 								                         PATH_RAY_SHADOW,
 								                         object,
 								                         prim_addr);
@@ -390,9 +399,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					object = kernel_tex_fetch(__prim_object, -prim_addr-1);
 
 #  if BVH_FEATURE(BVH_MOTION)
-					bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+					isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-					bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+					isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
 					num_hits_in_instance = 0;
@@ -414,8 +423,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
-
 					++stack_ptr;
 					kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 					traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -445,11 +452,10 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
 			}
 
@@ -472,8 +478,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/bvh/qbvh_subsurface.h b/intern/cycles/kernel/bvh/qbvh_subsurface.h
index 84dc4003133..be7658d11d7 100644
--- a/intern/cycles/kernel/bvh/qbvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/qbvh_subsurface.h
@@ -64,16 +64,16 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	if(!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
 		Transform ob_itfm;
-		bvh_instance_motion_push(kg,
-		                         subsurface_object,
-		                         ray,
-		                         &P,
-		                         &dir,
-		                         &idir,
-		                         &isect_t,
-		                         &ob_itfm);
+		isect_t = bvh_instance_motion_push(kg,
+		                                   subsurface_object,
+		                                   ray,
+		                                   &P,
+		                                   &dir,
+		                                   &idir,
+		                                   isect_t,
+		                                   &ob_itfm);
 #else
-		bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, &isect_t);
+		isect_t = bvh_instance_push(kg, subsurface_object, ray, &P, &dir, &idir, isect_t);
 #endif
 		object = subsurface_object;
 	}
@@ -105,9 +105,6 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -253,9 +250,9 @@ ccl_device void BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						for(; prim_addr < prim_addr2; prim_addr++) {
 							kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 							triangle_intersect_subsurface(kg,
-							                              &isect_precalc,
 							                              ss_isect,
 							                              P,
+							                              dir,
 							                              object,
 							                              prim_addr,
 							                              isect_t,
diff --git a/intern/cycles/kernel/bvh/qbvh_traversal.h b/intern/cycles/kernel/bvh/qbvh_traversal.h
index 10ae7bee852..fca75a1d416 100644
--- a/intern/cycles/kernel/bvh/qbvh_traversal.h
+++ b/intern/cycles/kernel/bvh/qbvh_traversal.h
@@ -106,15 +106,13 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
 			/* Traverse internal nodes. */
 			while(node_addr >= 0 && node_addr != ENTRYPOINT_SENTINEL) {
 				float4 inodes = kernel_tex_fetch(__bvh_nodes, node_addr+0);
+				(void)inodes;
 
 				if(UNLIKELY(node_dist > isect->t)
 #if BVH_FEATURE(BVH_MOTION)
@@ -122,8 +120,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				   || UNLIKELY(ray->time > inodes.z)
 #endif
 #ifdef __VISIBILITY_FLAG__
-				   || (__float_as_uint(inodes.x) & visibility) == 0)
+				   || (__float_as_uint(inodes.x) & visibility) == 0
 #endif
+				 )
 				{
 					/* Pop. */
 					node_addr = traversal_stack[stack_ptr].addr;
@@ -333,9 +332,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 								BVH_DEBUG_NEXT_INTERSECTION();
 								kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
 								if(triangle_intersect(kg,
-								                      &isect_precalc,
 								                      isect,
 								                      P,
+								                      dir,
 								                      visibility,
 								                      object,
 								                      prim_addr)) {
@@ -447,8 +446,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-					triangle_intersect_precalc(dir, &isect_precalc);
-
 					++stack_ptr;
 					kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 					traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -468,9 +465,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 			qbvh_near_far_idx_calc(idir,
@@ -489,8 +486,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			node_dist = traversal_stack[stack_ptr].dist;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume.h b/intern/cycles/kernel/bvh/qbvh_volume.h
index dc6627e2dbb..192ce009524 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume.h
@@ -91,9 +91,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -266,7 +263,7 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									continue;
 								}
 								/* Intersect ray against primitive. */
-								triangle_intersect(kg, &isect_precalc, isect, P, visibility, object, prim_addr);
+								triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
 							}
 							break;
 						}
@@ -295,9 +292,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+						isect->t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect->t);
+						isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 						qbvh_near_far_idx_calc(idir,
@@ -316,8 +313,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
-
 						++stack_ptr;
 						kernel_assert(stack_ptr < BVH_QSTACK_SIZE);
 						traversal_stack[stack_ptr].addr = ENTRYPOINT_SENTINEL;
@@ -341,9 +336,9 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 
 			/* Instance pop. */
 #  if BVH_FEATURE(BVH_MOTION)
-			bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &isect->t, &ob_itfm);
+			isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
 #  else
-			bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &isect->t);
+			isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #  endif
 
 			qbvh_near_far_idx_calc(idir,
@@ -362,8 +357,6 @@ ccl_device bool BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/bvh/qbvh_volume_all.h b/intern/cycles/kernel/bvh/qbvh_volume_all.h
index ff1fa92af6e..ac5f58a9a51 100644
--- a/intern/cycles/kernel/bvh/qbvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/qbvh_volume_all.h
@@ -95,9 +95,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 	                       &near_x, &near_y, &near_z,
 	                       &far_x, &far_y, &far_z);
 
-	IsectPrecalc isect_precalc;
-	triangle_intersect_precalc(dir, &isect_precalc);
-
 	/* Traversal loop. */
 	do {
 		do {
@@ -271,7 +268,7 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 									continue;
 								}
 								/* Intersect ray against primitive. */
-								hit = triangle_intersect(kg, &isect_precalc, isect_array, P, visibility, object, prim_addr);
+								hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
 								if(hit) {
 									/* Move on to next entry in intersections array. */
 									isect_array++;
@@ -346,9 +343,9 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 					int object_flag = kernel_tex_fetch(__object_flag, object);
 					if(object_flag & SD_OBJECT_HAS_VOLUME) {
 #  if BVH_FEATURE(BVH_MOTION)
-						bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &isect_t, &ob_itfm);
+						isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
 #  else
-						bvh_instance_push(kg, object, ray, &P, &dir, &idir, &isect_t);
+						isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
 #  endif
 
 						qbvh_near_far_idx_calc(idir,
@@ -367,7 +364,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 						org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-						triangle_intersect_precalc(dir, &isect_precalc);
 						num_hits_in_instance = 0;
 						isect_array->t = isect_t;
 
@@ -406,11 +402,10 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 				}
 			}
 			else {
-				float ignore_t = FLT_MAX;
 #  if BVH_FEATURE(BVH_MOTION)
-				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, &ignore_t, &ob_itfm);
+				bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
 #  else
-				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, &ignore_t);
+				bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
 #  endif
 			}
 
@@ -433,8 +428,6 @@ ccl_device uint BVH_FUNCTION_FULL_NAME(QBVH)(KernelGlobals *kg,
 			org4 = sse3f(ssef(P.x), ssef(P.y), ssef(P.z));
 #  endif
 
-			triangle_intersect_precalc(dir, &isect_precalc);
-
 			object = OBJECT_NONE;
 			node_addr = traversal_stack[stack_ptr].addr;
 			--stack_ptr;
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index b7abc1ec507..e799855a65e 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -20,17 +20,17 @@ ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType ty
 {
 	kernel_assert(size <= sizeof(ShaderClosure));
 
-	int num_closure = ccl_fetch(sd, num_closure);
-	int num_closure_extra = ccl_fetch(sd, num_closure_extra);
+	int num_closure = sd->num_closure;
+	int num_closure_extra = sd->num_closure_extra;
 	if(num_closure + num_closure_extra >= MAX_CLOSURE)
 		return NULL;
 
-	ShaderClosure *sc = &ccl_fetch(sd, closure)[num_closure];
+	ShaderClosure *sc = &sd->closure[num_closure];
 
 	sc->type = type;
 	sc->weight = weight;
 
-	ccl_fetch(sd, num_closure)++;
+	sd->num_closure++;
 
 	return sc;
 }
@@ -44,25 +44,25 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
 	 * This lets us keep the same fast array iteration over closures, as we
 	 * found linked list iteration and iteration with skipping to be slower. */
 	int num_extra = ((size + sizeof(ShaderClosure) - 1) / sizeof(ShaderClosure));
-	int num_closure = ccl_fetch(sd, num_closure);
-	int num_closure_extra = ccl_fetch(sd, num_closure_extra) + num_extra;
+	int num_closure = sd->num_closure;
+	int num_closure_extra = sd->num_closure_extra + num_extra;
 
 	if(num_closure + num_closure_extra > MAX_CLOSURE) {
 		/* Remove previous closure. */
-		ccl_fetch(sd, num_closure)--;
-		ccl_fetch(sd, num_closure_extra)++;
+		sd->num_closure--;
+		sd->num_closure_extra++;
 		return NULL;
 	}
 
-	ccl_fetch(sd, num_closure_extra) = num_closure_extra;
-	return (ccl_addr_space void*)(ccl_fetch(sd, closure) + MAX_CLOSURE - num_closure_extra);
+	sd->num_closure_extra = num_closure_extra;
+	return (ccl_addr_space void*)(sd->closure + MAX_CLOSURE - num_closure_extra);
 }
 
 ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)
 {
 	ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
 
-	if(!sc)
+	if(sc == NULL)
 		return NULL;
 
 	float sample_weight = fabsf(average(weight));
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 04f9e711c7e..0302fa9b43e 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,77 +14,77 @@
  * limitations under the License.
  */
 
-#include "../closure/bsdf_ashikhmin_velvet.h"
-#include "../closure/bsdf_diffuse.h"
-#include "../closure/bsdf_oren_nayar.h"
-#include "../closure/bsdf_phong_ramp.h"
-#include "../closure/bsdf_diffuse_ramp.h"
-#include "../closure/bsdf_microfacet.h"
-#include "../closure/bsdf_microfacet_multi.h"
-#include "../closure/bsdf_reflection.h"
-#include "../closure/bsdf_refraction.h"
-#include "../closure/bsdf_transparent.h"
-#include "../closure/bsdf_ashikhmin_shirley.h"
-#include "../closure/bsdf_toon.h"
-#include "../closure/bsdf_hair.h"
-#include "../closure/bsdf_principled_diffuse.h"
-#include "../closure/bsdf_principled_sheen.h"
+#include "kernel/closure/bsdf_ashikhmin_velvet.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_oren_nayar.h"
+#include "kernel/closure/bsdf_phong_ramp.h"
+#include "kernel/closure/bsdf_diffuse_ramp.h"
+#include "kernel/closure/bsdf_microfacet.h"
+#include "kernel/closure/bsdf_microfacet_multi.h"
+#include "kernel/closure/bsdf_reflection.h"
+#include "kernel/closure/bsdf_refraction.h"
+#include "kernel/closure/bsdf_transparent.h"
+#include "kernel/closure/bsdf_ashikhmin_shirley.h"
+#include "kernel/closure/bsdf_toon.h"
+#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
 #ifdef __SUBSURFACE__
-#  include "../closure/bssrdf.h"
+#  include "kernel/closure/bssrdf.h"
 #endif
 #ifdef __VOLUME__
-#  include "../closure/volume.h"
+#  include "kernel/closure/volume.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
 
 ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
-                                  ShaderData *sd,
-                                  const ShaderClosure *sc,
-                                  float randu,
-                                  float randv,
-                                  float3 *eval,
-                                  float3 *omega_in,
-                                  differential3 *domega_in,
-                                  float *pdf)
+                                       ShaderData *sd,
+                                       const ShaderClosure *sc,
+                                       float randu,
+                                       float randv,
+                                       float3 *eval,
+                                       float3 *omega_in,
+                                       differential3 *domega_in,
+                                       float *pdf)
 {
 	int label;
 
 	switch(sc->type) {
 		case CLOSURE_BSDF_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_ID:
-			label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __SVM__
 		case CLOSURE_BSDF_OREN_NAYAR_ID:
-			label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #ifdef __OSL__
 		case CLOSURE_BSDF_PHONG_RAMP_ID:
-			label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-			label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		case CLOSURE_BSDF_TRANSLUCENT_ID:
-			label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFLECTION_ID:
-			label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_REFRACTION_ID:
-			label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_TRANSPARENT_ID:
-			label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_GGX_ID:
@@ -93,63 +93,63 @@ ccl_device_forceinline int bsdf_sample(KernelGlobals *kg,
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 		case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-			label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
-			label = bsdf_microfacet_multi_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+			label = bsdf_microfacet_multi_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
 			break;
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
 		case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
-			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
-			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &ccl_fetch(sd, lcg_state));
+			label = bsdf_microfacet_multi_ggx_glass_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+			        eval, omega_in,  &domega_in->dx, &domega_in->dy, pdf, &sd->lcg_state);
 			break;
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 		case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-			label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 		case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-			label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-			label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-			label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_GLOSSY_TOON_ID:
-			label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-			label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-			label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
 		case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
-			label = bsdf_principled_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_principled_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 		case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
-			label = bsdf_principled_sheen_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
+			label = bsdf_principled_sheen_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
 				eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 #ifdef __VOLUME__
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-			label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+			label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
 			break;
 #endif
 		default:
@@ -173,35 +173,35 @@ float3 bsdf_eval(KernelGlobals *kg,
 {
 	float3 eval;
 
-	if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
+	if(dot(sd->Ng, omega_in) >= 0.0f) {
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __OSL__
 			case CLOSURE_BSDF_PHONG_RAMP_ID:
-				eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-				eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
@@ -209,51 +209,51 @@ float3 bsdf_eval(KernelGlobals *kg,
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+				eval = bsdf_microfacet_multi_ggx_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+				eval = bsdf_microfacet_multi_ggx_glass_eval_reflect(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
-				eval = bsdf_principled_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_principled_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
-				eval = bsdf_principled_sheen_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_principled_sheen_eval_reflect(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			default:
@@ -265,23 +265,23 @@ float3 bsdf_eval(KernelGlobals *kg,
 		switch(sc->type) {
 			case CLOSURE_BSDF_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_ID:
-				eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 #ifdef __SVM__
 			case CLOSURE_BSDF_OREN_NAYAR_ID:
-				eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSLUCENT_ID:
-				eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFLECTION_ID:
-				eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_REFRACTION_ID:
-				eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_TRANSPARENT_ID:
-				eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
@@ -289,51 +289,51 @@ float3 bsdf_eval(KernelGlobals *kg,
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_ANISO_FRESNEL_ID:
 			case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-				eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
-				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+				eval = bsdf_microfacet_multi_ggx_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
 			case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
-				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf, &ccl_fetch(sd, lcg_state));
+				eval = bsdf_microfacet_multi_ggx_glass_eval_transmit(sc, sd->I, omega_in, pdf, &sd->lcg_state);
 				break;
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
 			case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-				eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
 			case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-				eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_GLOSSY_TOON_ID:
-				eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-				eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-				eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
 			case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
-				eval = bsdf_principled_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_principled_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 			case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
-				eval = bsdf_principled_sheen_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = bsdf_principled_sheen_eval_transmit(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 #ifdef __VOLUME__
 			case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-				eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
+				eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
 				break;
 #endif
 			default:
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index b74e8ab97cf..58f6140970d 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -327,7 +327,7 @@ ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosur
 	       (bsdf_a->alpha_y == bsdf_b->alpha_y) &&
 	       (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
 	       (bsdf_a->ior == bsdf_b->ior) &&
-	       ((!bsdf_a->extra && !bsdf_b->extra) ||
+	       ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
 	        ((bsdf_a->extra && bsdf_b->extra) &&
 	         (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color))));
 }
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index a49b0717a3d..57f1e733ee7 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -43,7 +43,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
 ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
 {
 	if(cosI > 0.9999f || cosI < 1e-6f) {
-		const float r = sqrtf(randU.x / (1.0f - randU.x));
+		const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
 		const float phi = M_2PI_F * randU.y;
 		return make_float2(r*cosf(phi), r*sinf(phi));
 	}
@@ -83,7 +83,7 @@ ccl_device_forceinline float3 mf_sample_vndf(const float3 wi, const float2 alpha
 	const float3 wi_11 = normalize(make_float3(alpha.x*wi.x, alpha.y*wi.y, wi.z));
 	const float2 slope_11 = mf_sampleP22_11(wi_11.z, randU);
 
-	const float2 cossin_phi = normalize(make_float2(wi_11.x, wi_11.y));
+	const float3 cossin_phi = safe_normalize(make_float3(wi_11.x, wi_11.y, 0.0f));
 	const float slope_x = alpha.x*(cossin_phi.x * slope_11.x - cossin_phi.y * slope_11.y);
 	const float slope_y = alpha.y*(cossin_phi.y * slope_11.x + cossin_phi.x * slope_11.y);
 
@@ -313,18 +313,18 @@ ccl_device_forceinline float mf_glass_pdf(const float3 wi, const float3 wo, cons
 
 #define MF_PHASE_FUNCTION glass
 #define MF_MULTI_GLASS
-#include "bsdf_microfacet_multi_impl.h"
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 
 /* The diffuse phase function is not implemented as a node yet. */
 #if 0
 #define MF_PHASE_FUNCTION diffuse
 #define MF_MULTI_DIFFUSE
-#include "bsdf_microfacet_multi_impl.h"
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 #endif
 
 #define MF_PHASE_FUNCTION glossy
 #define MF_MULTI_GLOSSY
-#include "bsdf_microfacet_multi_impl.h"
+#include "kernel/closure/bsdf_microfacet_multi_impl.h"
 
 ccl_device void bsdf_microfacet_multi_ggx_blur(ShaderClosure *sc, float roughness)
 {
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 6838e26c242..c623e3490fd 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,19 +14,19 @@
  * limitations under the License.
  */
 
-#include "geom_attribute.h"
-#include "geom_object.h"
+#include "kernel/geom/geom_attribute.h"
+#include "kernel/geom/geom_object.h"
 #ifdef __PATCH_EVAL__
-#  include "geom_patch.h"
+#  include "kernel/geom/geom_patch.h"
 #endif
-#include "geom_triangle.h"
-#include "geom_subd_triangle.h"
-#include "geom_triangle_intersect.h"
-#include "geom_motion_triangle.h"
-#include "geom_motion_triangle_intersect.h"
-#include "geom_motion_triangle_shader.h"
-#include "geom_motion_curve.h"
-#include "geom_curve.h"
-#include "geom_volume.h"
-#include "geom_primitive.h"
+#include "kernel/geom/geom_triangle.h"
+#include "kernel/geom/geom_subd_triangle.h"
+#include "kernel/geom/geom_triangle_intersect.h"
+#include "kernel/geom/geom_motion_triangle.h"
+#include "kernel/geom/geom_motion_triangle_intersect.h"
+#include "kernel/geom/geom_motion_triangle_shader.h"
+#include "kernel/geom/geom_motion_curve.h"
+#include "kernel/geom/geom_curve.h"
+#include "kernel/geom/geom_volume.h"
+#include "kernel/geom/geom_primitive.h"
 
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index 08ccee56335..cc62192ef21 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -30,7 +30,7 @@ ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *
 ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return ATTR_PRIM_CURVE;
 	}
 	else
@@ -53,12 +53,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
 
 ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id)
 {
-	if(ccl_fetch(sd, object) == PRIM_NONE) {
+	if(sd->object == PRIM_NONE) {
 		return attribute_not_found();
 	}
 
 	/* for SVM, find attribute by unique id */
-	uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
+	uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
 	attr_offset += attribute_primitive_type(kg, sd);
 	uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
 	
@@ -73,7 +73,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg, const Sh
 	AttributeDescriptor desc;
 	desc.element = (AttributeElement)attr_map.y;
 	
-	if(ccl_fetch(sd, prim) == PRIM_NONE &&
+	if(sd->prim == PRIM_NONE &&
 	   desc.element != ATTR_ELEMENT_MESH &&
 	   desc.element != ATTR_ELEMENT_VOXEL &&
 	   desc.element != ATTR_ELEMENT_OBJECT)
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index 9de335403ce..bb33b91847e 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -22,6 +22,12 @@ CCL_NAMESPACE_BEGIN
 
 #ifdef __HAIR__
 
+#if defined(__KERNEL_CUDA__) && (__CUDA_ARCH__ < 300)
+#  define ccl_device_curveintersect ccl_device
+#else
+#  define ccl_device_curveintersect ccl_device_forceinline
+#endif
+
 /* Reading attributes on various curve elements */
 
 ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
@@ -32,22 +38,22 @@ ccl_device float curve_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 		if(dy) *dy = 0.0f;
 #endif
 
-		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + k0);
 		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + k1);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+		if(dx) *dx = sd->du.dx*(f1 - f0);
 		if(dy) *dy = 0.0f;
 #endif
 
-		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+		return (1.0f - sd->u)*f0 + sd->u*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -71,22 +77,22 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg, const ShaderData *sd
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
 	}
 	else if(desc.element == ATTR_ELEMENT_CURVE_KEY || desc.element == ATTR_ELEMENT_CURVE_KEY_MOTION) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*(f1 - f0);
+		if(dx) *dx = sd->du.dx*(f1 - f0);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
-		return (1.0f - ccl_fetch(sd, u))*f0 + ccl_fetch(sd, u)*f1;
+		return (1.0f - sd->u)*f0 + sd->u*f1;
 	}
 	else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -104,22 +110,22 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 {
 	float r = 0.0f;
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+		int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 		int k1 = k0 + 1;
 
 		float4 P_curve[2];
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+		if(sd->type & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
 		}
 
-		r = (P_curve[1].w - P_curve[0].w) * ccl_fetch(sd, u) + P_curve[0].w;
+		r = (P_curve[1].w - P_curve[0].w) * sd->u + P_curve[0].w;
 	}
 
 	return r*2.0f;
@@ -130,8 +136,8 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
 
 ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
 {
-	float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
-	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
+	int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 	int k1 = k0 + 1;
 
 	float4 P_curve[2];
@@ -139,7 +145,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
 	P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 	P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 
-	return float4_to_float3(P_curve[1]) * ccl_fetch(sd, u) + float4_to_float3(P_curve[0]) * (1.0f - ccl_fetch(sd, u));
+	return float4_to_float3(P_curve[1]) * sd->u + float4_to_float3(P_curve[0]) * (1.0f - sd->u);
 }
 
 /* Curve tangent normal */
@@ -148,14 +154,14 @@ ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
 {	
 	float3 tgN = make_float3(0.0f,0.0f,0.0f);
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 
-		tgN = -(-ccl_fetch(sd, I) - ccl_fetch(sd, dPdu) * (dot(ccl_fetch(sd, dPdu),-ccl_fetch(sd, I)) / len_squared(ccl_fetch(sd, dPdu))));
+		tgN = -(-sd->I - sd->dPdu * (dot(sd->dPdu,-sd->I) / len_squared(sd->dPdu)));
 		tgN = normalize(tgN);
 
 		/* need to find suitable scaled gd for corrected normal */
 #if 0
-		tgN = normalize(tgN - gd * ccl_fetch(sd, dPdu));
+		tgN = normalize(tgN - gd * sd->dPdu);
 #endif
 	}
 
@@ -222,13 +228,22 @@ ccl_device_inline ssef transform_point_T3(const ssef t[3], const ssef &a)
 
 #ifdef __KERNEL_SSE2__
 /* Pass P and dir by reference to aligned vector */
-ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	const float3 &P, const float3 &dir, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 #else
-ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_curveintersect bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 dir, uint visibility, int object, int curveAddr, float time,int type, uint *lcg_state, float difl, float extmax)
 #endif
 {
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {
+			return false;
+		}
+	}
+
 	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
 	float epsilon = 0.0f;
 	float r_st, r_en;
@@ -255,9 +270,9 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		int ka = max(k0 - 1, v00.x);
 		int kb = min(k1 + 1, v00.x + v00.y - 1);
 
-#ifdef __KERNEL_AVX2__
+#if defined(__KERNEL_AVX2__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
 		avxf P_curve_0_1, P_curve_2_3;
-		if(type & PRIMITIVE_CURVE) {
+		if(is_curve_primitive) {
 			P_curve_0_1 = _mm256_loadu2_m128(&kg->__curve_keys.data[k0].x, &kg->__curve_keys.data[ka].x);
 			P_curve_2_3 = _mm256_loadu2_m128(&kg->__curve_keys.data[kb].x, &kg->__curve_keys.data[k1].x);
 		}
@@ -268,7 +283,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 #else  /* __KERNEL_AVX2__ */
 		ssef P_curve[4];
 
-		if(type & PRIMITIVE_CURVE) {
+		if(is_curve_primitive) {
 			P_curve[0] = load4f(&kg->__curve_keys.data[ka].x);
 			P_curve[1] = load4f(&kg->__curve_keys.data[k0].x);
 			P_curve[2] = load4f(&kg->__curve_keys.data[k1].x);
@@ -290,7 +305,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 		ssef htfm1 = shuffle<1, 0, 1, 3>(load1f_first(extract<0>(d_ss)), vdir0);
 		ssef htfm2 = shuffle<1, 3, 2, 3>(mul_shuf, vdir0);
 
-#ifdef __KERNEL_AVX2__
+#if defined(__KERNEL_AVX2__) && (!defined(_MSC_VER) || _MSC_VER > 1800)
 		const avxf vPP = _mm256_broadcast_ps(&P.m128);
 		const avxf htfm00 = avxf(htfm0.m128, htfm0.m128);
 		const avxf htfm11 = avxf(htfm1.m128, htfm1.m128);
@@ -363,7 +378,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 
 		float4 P_curve[4];
 
-		if(type & PRIMITIVE_CURVE) {
+		if(is_curve_primitive) {
 			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
 			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
 			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
@@ -679,7 +694,7 @@ ccl_device_forceinline bool bvh_cardinal_curve_intersect(KernelGlobals *kg, Inte
 	return hit;
 }
 
-ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
+ccl_device_curveintersect bool bvh_curve_intersect(KernelGlobals *kg, Intersection *isect,
 	float3 P, float3 direction, uint visibility, int object, int curveAddr, float time, int type, uint *lcg_state, float difl, float extmax)
 {
 	/* define few macros to minimize code duplication for SSE */
@@ -689,6 +704,15 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #  define dot3(x, y) dot(x, y)
 #endif
 
+	const bool is_curve_primitive = (type & PRIMITIVE_CURVE);
+
+	if(!is_curve_primitive && kernel_data.bvh.use_bvh_steps) {
+		const float2 prim_time = kernel_tex_fetch(__prim_time, curveAddr);
+		if(time < prim_time.x || time > prim_time.y) {
+			return false;
+		}
+	}
+
 	int segment = PRIMITIVE_UNPACK_SEGMENT(type);
 	/* curve Intersection check */
 	int flags = kernel_data.curve.curveflags;
@@ -703,7 +727,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #ifndef __KERNEL_SSE2__
 	float4 P_curve[2];
 
-	if(type & PRIMITIVE_CURVE) {
+	if(is_curve_primitive) {
 		P_curve[0] = kernel_tex_fetch(__curve_keys, k0);
 		P_curve[1] = kernel_tex_fetch(__curve_keys, k1);
 	}
@@ -738,7 +762,7 @@ ccl_device_forceinline bool bvh_curve_intersect(KernelGlobals *kg, Intersection
 #else
 	ssef P_curve[2];
 	
-	if(type & PRIMITIVE_CURVE) {
+	if(is_curve_primitive) {
 		P_curve[0] = load4f(&kg->__curve_keys.data[k0].x);
 		P_curve[1] = load4f(&kg->__curve_keys.data[k1].x);
 	}
@@ -948,7 +972,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #endif
@@ -961,7 +985,7 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 	int prim = kernel_tex_fetch(__prim_index, isect->prim);
 	float4 v00 = kernel_tex_fetch(__curves, prim);
 
-	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(ccl_fetch(sd, type));
+	int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
 	int k1 = k0 + 1;
 
 	float3 tg;
@@ -972,14 +996,14 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 
 		float4 P_curve[4];
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+		if(sd->type & PRIMITIVE_CURVE) {
 			P_curve[0] = kernel_tex_fetch(__curve_keys, ka);
 			P_curve[1] = kernel_tex_fetch(__curve_keys, k0);
 			P_curve[2] = kernel_tex_fetch(__curve_keys, k1);
 			P_curve[3] = kernel_tex_fetch(__curve_keys, kb);
 		}
 		else {
-			motion_cardinal_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), ka, k0, k1, kb, P_curve);
+			motion_cardinal_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
 		}
 
 		float3 p[4];
@@ -991,43 +1015,43 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		P = P + D*t;
 
 #ifdef __UV__
-		ccl_fetch(sd, u) = isect->u;
-		ccl_fetch(sd, v) = 0.0f;
+		sd->u = isect->u;
+		sd->v = 0.0f;
 #endif
 
 		tg = normalize(curvetangent(isect->u, p[0], p[1], p[2], p[3]));
 
 		if(kernel_data.curve.curveflags & CURVE_KN_RIBBONS) {
-			ccl_fetch(sd, Ng) = normalize(-(D - tg * (dot(tg, D))));
+			sd->Ng = normalize(-(D - tg * (dot(tg, D))));
 		}
 		else {
 			/* direction from inside to surface of curve */
 			float3 p_curr = curvepoint(isect->u, p[0], p[1], p[2], p[3]);	
-			ccl_fetch(sd, Ng) = normalize(P - p_curr);
+			sd->Ng = normalize(P - p_curr);
 
 			/* adjustment for changing radius */
 			float gd = isect->v;
 
 			if(gd != 0.0f) {
-				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
-				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
 			}
 		}
 
 		/* todo: sometimes the normal is still so that this is detected as
 		 * backfacing even if cull backfaces is enabled */
 
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
+		sd->N = sd->Ng;
 	}
 	else {
 		float4 P_curve[2];
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_CURVE) {
+		if(sd->type & PRIMITIVE_CURVE) {
 			P_curve[0]= kernel_tex_fetch(__curve_keys, k0);
 			P_curve[1]= kernel_tex_fetch(__curve_keys, k1);
 		}
 		else {
-			motion_curve_keys(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), k0, k1, P_curve);
+			motion_curve_keys(kg, sd->object, sd->prim, sd->time, k0, k1, P_curve);
 		}
 
 		float l = 1.0f;
@@ -1038,39 +1062,39 @@ ccl_device_inline float3 bvh_curve_refine(KernelGlobals *kg, ShaderData *sd, con
 		float3 dif = P - float4_to_float3(P_curve[0]);
 
 #ifdef __UV__
-		ccl_fetch(sd, u) = dot(dif,tg)/l;
-		ccl_fetch(sd, v) = 0.0f;
+		sd->u = dot(dif,tg)/l;
+		sd->v = 0.0f;
 #endif
 
 		if(flag & CURVE_KN_TRUETANGENTGNORMAL) {
-			ccl_fetch(sd, Ng) = -(D - tg * dot(tg, D));
-			ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
+			sd->Ng = -(D - tg * dot(tg, D));
+			sd->Ng = normalize(sd->Ng);
 		}
 		else {
 			float gd = isect->v;
 
 			/* direction from inside to surface of curve */
-			ccl_fetch(sd, Ng) = (dif - tg * ccl_fetch(sd, u) * l) / (P_curve[0].w + ccl_fetch(sd, u) * l * gd);
+			sd->Ng = (dif - tg * sd->u * l) / (P_curve[0].w + sd->u * l * gd);
 
 			/* adjustment for changing radius */
 			if(gd != 0.0f) {
-				ccl_fetch(sd, Ng) = ccl_fetch(sd, Ng) - gd * tg;
-				ccl_fetch(sd, Ng) = normalize(ccl_fetch(sd, Ng));
+				sd->Ng = sd->Ng - gd * tg;
+				sd->Ng = normalize(sd->Ng);
 			}
 		}
 
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
+		sd->N = sd->Ng;
 	}
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	ccl_fetch(sd, dPdu) = tg;
-	ccl_fetch(sd, dPdv) = cross(tg, ccl_fetch(sd, Ng));
+	sd->dPdu = tg;
+	sd->dPdv = cross(tg, sd->Ng);
 #endif
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #endif
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index d57d74ea882..f74995becf5 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -48,7 +48,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
 			return P;
 		}
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #  else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -77,7 +77,7 @@ ccl_device_inline float3 motion_triangle_refine(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #  else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -116,7 +116,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
 #  ifdef __INTERSECTION_REFINE__
 	if(isect->object != OBJECT_NONE) {
 #    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #    else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -144,7 +144,7 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #    ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #    else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -166,14 +166,15 @@ float3 motion_triangle_refine_subsurface(KernelGlobals *kg,
  * time and do a ray intersection with the resulting triangle.
  */
 
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
-                                                 Intersection *isect,
-                                                 float3 P,
-                                                 float3 dir,
-                                                 float time,
-                                                 uint visibility,
-                                                 int object,
-                                                 int prim_addr)
+ccl_device_inline bool motion_triangle_intersect(
+        KernelGlobals *kg,
+        Intersection *isect,
+        float3 P,
+        float3 dir,
+        float time,
+        uint visibility,
+        int object,
+        int prim_addr)
 {
 	/* Primitive index for vertex location lookup. */
 	int prim = kernel_tex_fetch(__prim_index, prim_addr);
@@ -185,11 +186,15 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
 	motion_triangle_vertices(kg, fobject, prim, time, verts);
 	/* Ray-triangle intersection, unoptimized. */
 	float t, u, v;
-	if(ray_triangle_intersect_uv(P,
-	                             dir,
-	                             isect->t,
-	                             verts[2], verts[0], verts[1],
-	                             &u, &v, &t))
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          isect->t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          (ssef*)verts,
+#else
+	                          verts[0], verts[1], verts[2],
+#endif
+	                          &u, &v, &t))
 	{
 #ifdef __VISIBILITY_FLAG__
 		/* Visibility flag test. we do it here under the assumption
@@ -237,11 +242,15 @@ ccl_device_inline void motion_triangle_intersect_subsurface(
 	motion_triangle_vertices(kg, fobject, prim, time, verts);
 	/* Ray-triangle intersection, unoptimized. */
 	float t, u, v;
-	if(ray_triangle_intersect_uv(P,
-	                             dir,
-	                             tmax,
-	                             verts[2], verts[0], verts[1],
-	                             &u, &v, &t))
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          tmax,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          (ssef*)verts,
+#else
+	                          verts[0], verts[1], verts[2],
+#endif
+	                          &u, &v, &t))
 	{
 		for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
 			if(ss_isect->hits[i].t == t) {
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
index 0e024a05db6..cb456056e20 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -39,26 +39,26 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
                                                       bool subsurface)
 {
 	/* Get shader. */
-	ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+	sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 	/* Get motion info. */
 	/* TODO(sergey): This logic is really similar to motion_triangle_vertices(),
 	 * can we de-duplicate something here?
 	 */
 	int numsteps, numverts;
-	object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
+	object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
 	/* Figure out which steps we need to fetch and their interpolation factor. */
 	int maxstep = numsteps*2;
-	int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
-	float t = ccl_fetch(sd, time)*maxstep - step;
+	int step = min((int)(sd->time*maxstep), maxstep-1);
+	float t = sd->time*maxstep - step;
 	/* Find attribute. */
 	AttributeElement elem;
-	int offset = find_attribute_motion(kg, ccl_fetch(sd, object),
+	int offset = find_attribute_motion(kg, sd->object,
 	                                   ATTR_STD_MOTION_VERTEX_POSITION,
 	                                   &elem);
 	kernel_assert(offset != ATTR_STD_NOT_FOUND);
 	/* Fetch vertex coordinates. */
 	float3 verts[3], next_verts[3];
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
 	motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
 	/* Interpolate between steps. */
@@ -68,7 +68,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
 	/* Compute refined position. */
 #ifdef __SUBSURFACE__
 	if(subsurface) {
-		ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg,
+		sd->P = motion_triangle_refine_subsurface(kg,
 		                                                     sd,
 		                                                     isect,
 		                                                     ray,
@@ -77,29 +77,29 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
 	else
 #endif  /*  __SUBSURFACE__*/
 	{
-		ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
+		sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
 	}
 	/* Compute face normal. */
 	float3 Ng;
-	if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+	if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
 		Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
 	}
 	else {
 		Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
 	}
-	ccl_fetch(sd, Ng) = Ng;
-	ccl_fetch(sd, N) = Ng;
+	sd->Ng = Ng;
+	sd->N = Ng;
 	/* Compute derivatives of P w.r.t. uv. */
 #ifdef __DPDU__
-	ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
-	ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
+	sd->dPdu = (verts[0] - verts[2]);
+	sd->dPdv = (verts[1] - verts[2]);
 #endif
 	/* Compute smooth normal. */
-	if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+	if(sd->shader & SHADER_SMOOTH_NORMAL) {
 		/* Find attribute. */
 		AttributeElement elem;
 		int offset = find_attribute_motion(kg,
-		                                   ccl_fetch(sd, object),
+		                                   sd->object,
 		                                   ATTR_STD_MOTION_VERTEX_NORMAL,
 		                                   &elem);
 		kernel_assert(offset != ATTR_STD_NOT_FOUND);
@@ -112,10 +112,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg,
 		normals[1] = (1.0f - t)*normals[1] + t*next_normals[1];
 		normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
 		/* Interpolate between vertices. */
-		float u = ccl_fetch(sd, u);
-		float v = ccl_fetch(sd, v);
+		float u = sd->u;
+		float v = sd->v;
 		float w = 1.0f - u - v;
-		ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
+		sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
 	}
 }
 
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index f51b2d18657..6ecdfe0173a 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -137,9 +137,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point_auto(&ccl_fetch(sd, ob_tfm), *P);
+	*P = transform_point_auto(&sd->ob_tfm, *P);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -149,9 +149,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
 ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-	*P = transform_point_auto(&ccl_fetch(sd, ob_itfm), *P);
+	*P = transform_point_auto(&sd->ob_itfm, *P);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*P = transform_point(&tfm, *P);
 #endif
 }
@@ -161,12 +161,12 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	if((ccl_fetch(sd, object) != OBJECT_NONE) || (ccl_fetch(sd, type) == PRIMITIVE_LAMP)) {
-		*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_tfm), *N));
+	if((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
+		*N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N));
 	}
 #else
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
-		Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	if(sd->object != OBJECT_NONE) {
+		Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 		*N = normalize(transform_direction_transposed(&tfm, *N));
 	}
 #endif
@@ -177,9 +177,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
 ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-	*N = normalize(transform_direction_transposed_auto(&ccl_fetch(sd, ob_itfm), *N));
+	*N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N));
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -189,9 +189,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
 ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction_auto(&ccl_fetch(sd, ob_tfm), *D);
+	*D = transform_direction_auto(&sd->ob_tfm, *D);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -201,9 +201,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
 ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-	*D = transform_direction_auto(&ccl_fetch(sd, ob_itfm), *D);
+	*D = transform_direction_auto(&sd->ob_itfm, *D);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	*D = transform_direction(&tfm, *D);
 #endif
 }
@@ -212,13 +212,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
 
 ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-	if(ccl_fetch(sd, object) == OBJECT_NONE)
+	if(sd->object == OBJECT_NONE)
 		return make_float3(0.0f, 0.0f, 0.0f);
 
 #ifdef __OBJECT_MOTION__
-	return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
+	return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
 #else
-	Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
+	Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
 	return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
 #endif
 }
@@ -326,7 +326,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-	return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE + 1);
+	return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE + 1);
 }
 
 /* Particle data from which object was instanced */
@@ -425,7 +425,13 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 
 /* Transform ray into object space to enter static object in BVH */
 
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
+ccl_device_inline float bvh_instance_push(KernelGlobals *kg,
+                                          int object,
+                                          const Ray *ray,
+                                          float3 *P,
+                                          float3 *dir,
+                                          float3 *idir,
+                                          float t)
 {
 	Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 
@@ -435,8 +441,11 @@ ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ra
 	*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
 	*idir = bvh_inverse_direction(*dir);
 
-	if(*t != FLT_MAX)
-		*t *= len;
+	if(t != FLT_MAX) {
+		t *= len;
+	}
+
+	return t;
 }
 
 #ifdef __QBVH__
@@ -473,16 +482,24 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
 
 /* Transorm ray to exit static object in BVH */
 
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
+ccl_device_inline float bvh_instance_pop(KernelGlobals *kg,
+                                         int object,
+                                         const Ray *ray,
+                                         float3 *P,
+                                         float3 *dir,
+                                         float3 *idir,
+                                         float t)
 {
-	if(*t != FLT_MAX) {
+	if(t != FLT_MAX) {
 		Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-		*t /= len(transform_direction(&tfm, ray->D));
+		t /= len(transform_direction(&tfm, ray->D));
 	}
 
 	*P = ray->P;
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
+
+	return t;
 }
 
 /* Same as above, but returns scale factor to apply to multiple intersection distances */
@@ -501,13 +518,13 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg, int object, co
 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
                                                 int object,
                                                 const Ray *ray,
                                                 float3 *P,
                                                 float3 *dir,
                                                 float3 *idir,
-                                                ccl_addr_space float *t,
+                                                float t,
                                                 Transform *itfm)
 {
 	object_fetch_transform_motion_test(kg, object, ray->time, itfm);
@@ -518,8 +535,11 @@ ccl_device_inline void bvh_instance_motion_push(KernelGlobals *kg,
 	*dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
 	*idir = bvh_inverse_direction(*dir);
 
-	if(*t != FLT_MAX)
-		*t *= len;
+	if(t != FLT_MAX) {
+		t *= len;
+	}
+
+	return t;
 }
 
 #ifdef __QBVH__
@@ -557,22 +577,24 @@ ccl_device_inline void qbvh_instance_motion_push(KernelGlobals *kg,
 
 /* Transorm ray to exit motion blurred object in BVH */
 
-ccl_device_inline void bvh_instance_motion_pop(KernelGlobals *kg,
-                                               int object,
-                                               const Ray *ray,
-                                               float3 *P,
-                                               float3 *dir,
-                                               float3 *idir,
-                                               ccl_addr_space float *t,
-                                               Transform *itfm)
-{
-	if(*t != FLT_MAX) {
-		*t /= len(transform_direction(itfm, ray->D));
+ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
+                                                int object,
+                                                const Ray *ray,
+                                                float3 *P,
+                                                float3 *dir,
+                                                float3 *idir,
+                                                float t,
+                                                Transform *itfm)
+{
+	if(t != FLT_MAX) {
+		t /= len(transform_direction(itfm, ray->D));
 	}
 
 	*P = ray->P;
 	*dir = bvh_clamp_direction(ray->D);
 	*idir = bvh_inverse_direction(*dir);
+
+	return t;
 }
 
 /* Same as above, but returns scale factor to apply to multiple intersection distances */
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index 6a0ff5a4a04..5663b598508 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -267,7 +267,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg, const ShaderData *sd, int o
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float val = 0.0f;
@@ -294,7 +294,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg, const ShaderData *sd, int
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float3 val = make_float3(0.0f, 0.0f, 0.0f);
@@ -321,7 +321,7 @@ ccl_device float3 patch_eval_uchar4(KernelGlobals *kg, const ShaderData *sd, int
 	float weights_du[PATCH_MAX_CONTROL_VERTS];
 	float weights_dv[PATCH_MAX_CONTROL_VERTS];
 
-	int num_control = patch_eval_control_verts(kg, ccl_fetch(sd, object), patch, u, v, channel,
+	int num_control = patch_eval_control_verts(kg, sd->object, patch, u, v, channel,
 	                                           indices, weights, weights_du, weights_dv);
 
 	float3 val = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index 8a73bb2f78b..90a9c2147cc 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -28,19 +28,19 @@ ccl_device_inline float primitive_attribute_float(KernelGlobals *kg,
                                                   const AttributeDescriptor desc,
                                                   float *dx, float *dy)
 {
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		if(subd_triangle_patch(kg, sd) == ~0)
 			return triangle_attribute_float(kg, sd, desc, dx, dy);
 		else
 			return subd_triangle_attribute_float(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+	else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float(kg, sd, desc, dx, dy);
 	}
 #endif
@@ -56,19 +56,19 @@ ccl_device_inline float3 primitive_attribute_float3(KernelGlobals *kg,
                                                     const AttributeDescriptor desc,
                                                     float3 *dx, float3 *dy)
 {
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
+	if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
 		if(subd_triangle_patch(kg, sd) == ~0)
 			return triangle_attribute_float3(kg, sd, desc, dx, dy);
 		else
 			return subd_triangle_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #ifdef __HAIR__
-	else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	else if(sd->type & PRIMITIVE_ALL_CURVE) {
 		return curve_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, object) != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
+	else if(sd->object != OBJECT_NONE && desc.element == ATTR_ELEMENT_VOXEL) {
 		return volume_attribute_float3(kg, sd, desc, dx, dy);
 	}
 #endif
@@ -118,9 +118,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
+	if(sd->type & PRIMITIVE_ALL_CURVE)
 #  ifdef __DPDU__
-		return normalize(ccl_fetch(sd, dPdu));
+		return normalize(sd->dPdu);
 #  else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #  endif
@@ -133,12 +133,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 		float3 data = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 		data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
 		object_normal_transform(kg, sd, &data);
-		return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
+		return cross(sd->N, normalize(cross(data, sd->N)));
 	}
 	else {
 		/* otherwise use surface derivatives */
 #ifdef __DPDU__
-		return normalize(ccl_fetch(sd, dPdu));
+		return normalize(sd->dPdu);
 #else
 		return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -153,17 +153,17 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	float3 center;
 
 #ifdef __HAIR__
-	bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
+	bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
 	if(is_curve_primitive) {
 		center = curve_motion_center_location(kg, sd);
 
-		if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &center);
 		}
 	}
 	else
 #endif
-		center = ccl_fetch(sd, P);
+		center = sd->P;
 
 	float3 motion_pre = center, motion_post = center;
 
@@ -173,16 +173,16 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	if(desc.offset != ATTR_STD_NOT_FOUND) {
 		/* get motion info */
 		int numverts, numkeys;
-		object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
+		object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
 
 		/* lookup attributes */
 		motion_pre = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
-		desc.offset += (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
+		desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE)? numverts: numkeys;
 		motion_post = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
 #ifdef __HAIR__
-		if(is_curve_primitive && (ccl_fetch(sd, object_flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+		if(is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
 			object_position_transform(kg, sd, &motion_pre);
 			object_position_transform(kg, sd, &motion_post);
 		}
@@ -193,10 +193,10 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *
 	 * transformation was set match the world/object space of motion_pre/post */
 	Transform tfm;
 	
-	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
+	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
 	motion_pre = transform_point(&tfm, motion_pre);
 
-	tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
+	tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
 	motion_post = transform_point(&tfm, motion_post);
 
 	float3 motion_center;
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 647840dc696..044e82f03d4 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -22,14 +22,14 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
 {
-	return (ccl_fetch(sd, prim) != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, ccl_fetch(sd, prim)) : ~0;
+	return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
 }
 
 /* UV coords of triangle within patch */
 
 ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg, const ShaderData *sd, float2 uv[3])
 {
-	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 	uv[0] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.x);
 	uv[1] = kernel_tex_fetch(__tri_patch_uv, tri_vindex.y);
@@ -110,7 +110,7 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float2 dpdv = uv[1] - uv[2];
 
 		/* p is [s, t] */
-		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+		float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
 
 		float a, dads, dadt;
 		a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
@@ -123,8 +123,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 			float dtdv = dpdv.y;
 
 			if(dx) {
-				float dudx = ccl_fetch(sd, du).dx;
-				float dvdx = ccl_fetch(sd, dv).dx;
+				float dudx = sd->du.dx;
+				float dvdx = sd->dv.dx;
 
 				float dsdx = dsdu*dudx + dsdv*dvdx;
 				float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -132,8 +132,8 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 				*dx = dads*dsdx + dadt*dtdx;
 			}
 			if(dy) {
-				float dudy = ccl_fetch(sd, du).dy;
-				float dvdy = ccl_fetch(sd, dv).dy;
+				float dudy = sd->du.dy;
+				float dvdy = sd->dv.dy;
 
 				float dsdy = dsdu*dudy + dsdv*dvdy;
 				float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -174,11 +174,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER) {
 		float2 uv[3];
@@ -202,11 +202,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals *kg, const
 		float c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -229,7 +229,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float2 dpdv = uv[1] - uv[2];
 
 		/* p is [s, t] */
-		float2 p = dpdu * ccl_fetch(sd, u) + dpdv * ccl_fetch(sd, v) + uv[2];
+		float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
 
 		float3 a, dads, dadt;
 
@@ -248,8 +248,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 			float dtdv = dpdv.y;
 
 			if(dx) {
-				float dudx = ccl_fetch(sd, du).dx;
-				float dvdx = ccl_fetch(sd, dv).dx;
+				float dudx = sd->du.dx;
+				float dvdx = sd->dv.dx;
 
 				float dsdx = dsdu*dudx + dsdv*dvdx;
 				float dtdx = dtdu*dudx + dtdv*dvdx;
@@ -257,8 +257,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 				*dx = dads*dsdx + dadt*dtdx;
 			}
 			if(dy) {
-				float dudy = ccl_fetch(sd, du).dy;
-				float dvdy = ccl_fetch(sd, dv).dy;
+				float dudy = sd->du.dy;
+				float dvdy = sd->dv.dy;
 
 				float dsdy = dsdu*dudy + dsdv*dvdy;
 				float dtdy = dtdu*dudy + dtdv*dvdy;
@@ -299,11 +299,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
 		float2 uv[3];
@@ -337,11 +337,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg, con
 		float3 c = mix(mix(f0, f1, uv[2].x), mix(f3, f2, uv[2].x), uv[2].y);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*a + ccl_fetch(sd, dv).dx*b - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*c;
-		if(dy) *dy = ccl_fetch(sd, du).dy*a + ccl_fetch(sd, dv).dy*b - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*c;
+		if(dx) *dx = sd->du.dx*a + sd->dv.dx*b - (sd->du.dx + sd->dv.dx)*c;
+		if(dy) *dy = sd->du.dy*a + sd->dv.dy*b - (sd->du.dy + sd->dv.dy)*c;
 #endif
 
-		return ccl_fetch(sd, u)*a + ccl_fetch(sd, v)*b + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*c;
+		return sd->u*a + sd->v*b + (1.0f - sd->u - sd->v)*c;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index 3229091bbb0..47778553b94 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -26,13 +26,13 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
 	/* load triangle vertices */
-	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+	const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 	const float3 v0 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+0));
 	const float3 v1 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+1));
 	const float3 v2 = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w+2));
 
 	/* return normal */
-	if(ccl_fetch(sd, object_flag) & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+	if(sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
 		return normalize(cross(v2 - v0, v1 - v0));
 	}
 	else {
@@ -110,34 +110,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
 		if(dx) *dx = 0.0f;
 		if(dy) *dy = 0.0f;
 
-		return kernel_tex_fetch(__attributes_float, desc.offset + ccl_fetch(sd, prim));
+		return kernel_tex_fetch(__attributes_float, desc.offset + sd->prim);
 	}
 	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
-		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float f0 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.x);
 		float f1 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.y);
 		float f2 = kernel_tex_fetch(__attributes_float, desc.offset + tri_vindex.z);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER) {
-		int tri = desc.offset + ccl_fetch(sd, prim)*3;
+		int tri = desc.offset + sd->prim*3;
 		float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
 		float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
 		float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else {
 		if(dx) *dx = 0.0f;
@@ -153,24 +153,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
 		if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + ccl_fetch(sd, prim)));
+		return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + sd->prim));
 	}
 	else if(desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
-		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
+		uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
 
 		float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
 		float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
 		float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else if(desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
-		int tri = desc.offset + ccl_fetch(sd, prim)*3;
+		int tri = desc.offset + sd->prim*3;
 		float3 f0, f1, f2;
 
 		if(desc.element == ATTR_ELEMENT_CORNER) {
@@ -185,11 +185,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
 		}
 
 #ifdef __RAY_DIFFERENTIALS__
-		if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
-		if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
+		if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
+		if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
 #endif
 
-		return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
+		return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
 	}
 	else {
 		if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index 4db121d94f4..804e74d7e37 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -22,232 +22,50 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* Workaround stupidness of CUDA/OpenCL which doesn't allow to access indexed
- * component of float3 value.
- */
-#ifndef __KERNEL_CPU__
-#  define IDX(vec, idx) \
-    ((idx == 0) ? ((vec).x) : ( (idx == 1) ? ((vec).y) : ((vec).z) ))
-#else
-#  define IDX(vec, idx) ((vec)[idx])
-#endif
-
-/* Ray-Triangle intersection for BVH traversal
- *
- * Sven Woop
- * Watertight Ray/Triangle Intersection
- *
- * http://jcgt.org/published/0002/01/05/paper.pdf
- */
-
-/* Precalculated data for the ray->tri intersection. */
-typedef struct IsectPrecalc {
-	/* Maximal dimension kz, and orthogonal dimensions. */
-	int kx, ky, kz;
-
-	/* Shear constants. */
-	float Sx, Sy, Sz;
-} IsectPrecalc;
-
-#if (defined(__KERNEL_OPENCL_APPLE__)) || \
-    (defined(__KERNEL_CUDA__) && (defined(i386) || defined(_M_IX86)))
-ccl_device_noinline
-#else
-ccl_device_inline
-#endif
-void triangle_intersect_precalc(float3 dir,
-                                IsectPrecalc *isect_precalc)
-{
-	/* Calculate dimension where the ray direction is maximal. */
-#ifndef __KERNEL_SSE__
-	int kz = util_max_axis(make_float3(fabsf(dir.x),
-	                                   fabsf(dir.y),
-	                                   fabsf(dir.z)));
-	int kx = kz + 1; if(kx == 3) kx = 0;
-	int ky = kx + 1; if(ky == 3) ky = 0;
-#else
-	int kx, ky, kz;
-	/* Avoiding mispredicted branch on direction. */
-	kz = util_max_axis(fabs(dir));
-	static const char inc_xaxis[] = {1, 2, 0, 55};
-	static const char inc_yaxis[] = {2, 0, 1, 55};
-	kx = inc_xaxis[kz];
-	ky = inc_yaxis[kz];
-#endif
-
-	float dir_kz = IDX(dir, kz);
-
-	/* Swap kx and ky dimensions to preserve winding direction of triangles. */
-	if(dir_kz < 0.0f) {
-		int tmp = kx;
-		kx = ky;
-		ky = tmp;
-	}
-
-	/* Calculate the shear constants. */
-	float inv_dir_z = 1.0f / dir_kz;
-	isect_precalc->Sx = IDX(dir, kx) * inv_dir_z;
-	isect_precalc->Sy = IDX(dir, ky) * inv_dir_z;
-	isect_precalc->Sz = inv_dir_z;
-
-	/* Store the dimensions. */
-	isect_precalc->kx = kx;
-	isect_precalc->ky = ky;
-	isect_precalc->kz = kz;
-}
-
-/* TODO(sergey): Make it general utility function. */
-ccl_device_inline float xor_signmask(float x, int y)
-{
-	return __int_as_float(__float_as_int(x) ^ y);
-}
-
 ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
-                                          const IsectPrecalc *isect_precalc,
                                           Intersection *isect,
                                           float3 P,
+                                          float3 dir,
                                           uint visibility,
                                           int object,
                                           int prim_addr)
 {
-	const int kx = isect_precalc->kx;
-	const int ky = isect_precalc->ky;
-	const int kz = isect_precalc->kz;
-	const float Sx = isect_precalc->Sx;
-	const float Sy = isect_precalc->Sy;
-	const float Sz = isect_precalc->Sz;
-
-	/* Calculate vertices relative to ray origin. */
 	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-	const avxf avxf_P(P.m128, P.m128);
-
-	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
-	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
-
-	const avxf AB = tri_ab - avxf_P;
-	const avxf BC = tri_bc - avxf_P;
-
-	const __m256i permute_mask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
-
-	const avxf AB_k = shuffle(AB, permute_mask);
-	const avxf BC_k = shuffle(BC, permute_mask);
-
-	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
-	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
-
-	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
-	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
-
-	const avxf Sxy(Sy, Sx, Sy, Sx);
-
-	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
-	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
-
-	float ABBC_kz_array[8];
-	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
-
-	const float A_kz = ABBC_kz_array[0];
-	const float B_kz = ABBC_kz_array[2];
-	const float C_kz = ABBC_kz_array[6];
-
-	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
-	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
-
-	const avxf neg_mask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
-
-	/* W           U                             V
-	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
-	 */
-	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, neg_mask /* Dont care */);
-
-	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ neg_mask;
-
-	/* Calculate scaled barycentric coordinates. */
-	float WUVW_array[4];
-	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
-
-	const float W = WUVW_array[0];
-	const float U = WUVW_array[1];
-	const float V = WUVW_array[2];
-
-	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
-	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
-	                                               _mm256_setzero_ps(), 0));
-
-	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
-		return false;
-	}
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
 #else
 	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
 	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
 	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
-	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
-	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
-	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
-
-	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
-	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
-	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
-
-	/* Perform shear and scale of vertices. */
-	const float Ax = A_kx - Sx * A_kz;
-	const float Ay = A_ky - Sy * A_kz;
-	const float Bx = B_kx - Sx * B_kz;
-	const float By = B_ky - Sy * B_kz;
-	const float Cx = C_kx - Sx * C_kz;
-	const float Cy = C_ky - Sy * C_kz;
-
-	/* Calculate scaled barycentric coordinates. */
-	float U = Cx * By - Cy * Bx;
-	float V = Ax * Cy - Ay * Cx;
-	float W = Bx * Ay - By * Ax;
-	if((U < 0.0f || V < 0.0f || W < 0.0f) &&
-	   (U > 0.0f || V > 0.0f || W > 0.0f))
-	{
-		return false;
-	}
 #endif
-
-	/* Calculate determinant. */
-	float det = U + V + W;
-	if(UNLIKELY(det == 0.0f)) {
-		return false;
-	}
-
-	/* Calculate scaled z-coordinates of vertices and use them to calculate
-	 * the hit distance.
-	 */
-	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
-	const int sign_det = (__float_as_int(det) & 0x80000000);
-	const float sign_T = xor_signmask(T, sign_det);
-	if((sign_T < 0.0f) ||
-	   (sign_T > isect->t * xor_signmask(det, sign_det)))
+	float t, u, v;
+	if(ray_triangle_intersect(P,
+	                          dir,
+	                          isect->t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                          ssef_verts,
+#else
+	                          float4_to_float3(tri_a),
+	                          float4_to_float3(tri_b),
+	                          float4_to_float3(tri_c),
+#endif
+	                          &u, &v, &t))
 	{
-		return false;
-	}
-
 #ifdef __VISIBILITY_FLAG__
-	/* visibility flag test. we do it here under the assumption
-	 * that most triangles are culled by node flags */
-	if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
+		/* Visibility flag test. We do it here under the assumption
+		 * that most triangles are culled by node flags.
+		 */
+		if(kernel_tex_fetch(__prim_visibility, prim_addr) & visibility)
 #endif
-	{
-#ifdef __KERNEL_CUDA__
-		if(A == B && B == C) {
-			return false;
+		{
+			isect->prim = prim_addr;
+			isect->object = object;
+			isect->type = PRIMITIVE_TRIANGLE;
+			isect->u = u;
+			isect->v = v;
+			isect->t = t;
+			return true;
 		}
-#endif
-		/* Normalize U, V, W, and T. */
-		const float inv_det = 1.0f / det;
-		isect->prim = prim_addr;
-		isect->object = object;
-		isect->type = PRIMITIVE_TRIANGLE;
-		isect->u = U * inv_det;
-		isect->v = V * inv_det;
-		isect->t = T * inv_det;
-		return true;
 	}
 	return false;
 }
@@ -260,138 +78,37 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
 #ifdef __SUBSURFACE__
 ccl_device_inline void triangle_intersect_subsurface(
         KernelGlobals *kg,
-        const IsectPrecalc *isect_precalc,
         SubsurfaceIntersection *ss_isect,
         float3 P,
+        float3 dir,
         int object,
         int prim_addr,
         float tmax,
         uint *lcg_state,
         int max_hits)
 {
-	const int kx = isect_precalc->kx;
-	const int ky = isect_precalc->ky;
-	const int kz = isect_precalc->kz;
-	const float Sx = isect_precalc->Sx;
-	const float Sy = isect_precalc->Sy;
-	const float Sz = isect_precalc->Sz;
-
-	/* Calculate vertices relative to ray origin. */
 	const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, prim_addr);
-	const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex+0),
-	             tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex+1),
-	             tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex+2);
-
-#if defined(__KERNEL_AVX2__) && defined(__KERNEL_SSE__)
-	const avxf avxf_P(P.m128, P.m128);
-
-	const avxf tri_ab = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 0);
-	const avxf tri_bc = kernel_tex_fetch_avxf(__prim_tri_verts, tri_vindex + 1);
-
-	const avxf AB = tri_ab - avxf_P;
-	const avxf BC = tri_bc - avxf_P;
-
-	const __m256i permuteMask = _mm256_set_epi32(0x3, kz, ky, kx, 0x3, kz, ky, kx);
-
-	const avxf AB_k = shuffle(AB, permuteMask);
-	const avxf BC_k = shuffle(BC, permuteMask);
-
-	/* Akz, Akz, Bkz, Bkz, Bkz, Bkz, Ckz, Ckz */
-	const avxf ABBC_kz = shuffle<2>(AB_k, BC_k);
-
-	/* Akx, Aky, Bkx, Bky, Bkx,Bky, Ckx, Cky */
-	const avxf ABBC_kxy = shuffle<0,1,0,1>(AB_k, BC_k);
-
-	const avxf Sxy(Sy, Sx, Sy, Sx);
-
-	/* Ax, Ay, Bx, By, Bx, By, Cx, Cy */
-	const avxf ABBC_xy = nmadd(ABBC_kz, Sxy, ABBC_kxy);
-
-	float ABBC_kz_array[8];
-	_mm256_storeu_ps((float*)&ABBC_kz_array, ABBC_kz);
-
-	const float A_kz = ABBC_kz_array[0];
-	const float B_kz = ABBC_kz_array[2];
-	const float C_kz = ABBC_kz_array[6];
-
-	/* By, Bx, Cy, Cx, By, Bx, Ay, Ax */
-	const avxf BCBA_yx = permute<3,2,7,6,3,2,1,0>(ABBC_xy);
-
-	const avxf negMask(0,0,0,0,0x80000000, 0x80000000, 0x80000000, 0x80000000);
-
-	/* W           U                             V
-	 * (AxBy-AyBx) (BxCy-ByCx) XX XX (BxBy-ByBx) (CxAy-CyAx) XX XX
-	 */
-	const avxf WUxxxxVxx_neg = _mm256_hsub_ps(ABBC_xy * BCBA_yx, negMask /* Dont care */);
-
-	const avxf WUVWnegWUVW = permute<0,1,5,0,0,1,5,0>(WUxxxxVxx_neg) ^ negMask;
-
-	/* Calculate scaled barycentric coordinates. */
-	float WUVW_array[4];
-	_mm_storeu_ps((float*)&WUVW_array, _mm256_castps256_ps128 (WUVWnegWUVW));
-
-	const float W = WUVW_array[0];
-	const float U = WUVW_array[1];
-	const float V = WUVW_array[2];
-
-	const int WUVW_mask = 0x7 & _mm256_movemask_ps(WUVWnegWUVW);
-	const int WUVW_zero = 0x7 & _mm256_movemask_ps(_mm256_cmp_ps(WUVWnegWUVW,
-	                                               _mm256_setzero_ps(), 0));
-
-	if(!((WUVW_mask == 7) || (WUVW_mask == 0)) && ((WUVW_mask | WUVW_zero) != 7)) {
-		return;
-	}
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const ssef *ssef_verts = (ssef*)&kg->__prim_tri_verts.data[tri_vindex];
 #else
-	const float3 A = make_float3(tri_a.x - P.x, tri_a.y - P.y, tri_a.z - P.z);
-	const float3 B = make_float3(tri_b.x - P.x, tri_b.y - P.y, tri_b.z - P.z);
-	const float3 C = make_float3(tri_c.x - P.x, tri_c.y - P.y, tri_c.z - P.z);
-
-	const float A_kx = IDX(A, kx), A_ky = IDX(A, ky), A_kz = IDX(A, kz);
-	const float B_kx = IDX(B, kx), B_ky = IDX(B, ky), B_kz = IDX(B, kz);
-	const float C_kx = IDX(C, kx), C_ky = IDX(C, ky), C_kz = IDX(C, kz);
-
-	/* Perform shear and scale of vertices. */
-	const float Ax = A_kx - Sx * A_kz;
-	const float Ay = A_ky - Sy * A_kz;
-	const float Bx = B_kx - Sx * B_kz;
-	const float By = B_ky - Sy * B_kz;
-	const float Cx = C_kx - Sx * C_kz;
-	const float Cy = C_ky - Sy * C_kz;
-
-	/* Calculate scaled barycentric coordinates. */
-	float U = Cx * By - Cy * Bx;
-	float V = Ax * Cy - Ay * Cx;
-	float W = Bx * Ay - By * Ax;
-
-	if((U < 0.0f || V < 0.0f || W < 0.0f) &&
-	   (U > 0.0f || V > 0.0f || W > 0.0f))
-	{
-		return;
-	}
+	const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
+	             tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
+	             tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
 #endif
-
-	/* Calculate determinant. */
-	float det = U + V + W;
-	if(UNLIKELY(det == 0.0f)) {
-		return;
-	}
-
-	/* Calculate scaled z−coordinates of vertices and use them to calculate
-	 * the hit distance.
-	 */
-	const int sign_det = (__float_as_int(det) & 0x80000000);
-	const float T = (U * A_kz + V * B_kz + W * C_kz) * Sz;
-	const float sign_T = xor_signmask(T, sign_det);
-	if((sign_T < 0.0f) ||
-	   (sign_T > tmax * xor_signmask(det, sign_det)))
+	float t, u, v;
+	if(!ray_triangle_intersect(P,
+	                           dir,
+	                           tmax,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	                           ssef_verts,
+#else
+	                           tri_a, tri_b, tri_c,
+#endif
+	                           &u, &v, &t))
 	{
 		return;
 	}
 
-	/* Normalize U, V, W, and T. */
-	const float inv_det = 1.0f / det;
-
-	const float t = T * inv_det;
 	for(int i = min(max_hits, ss_isect->num_hits) - 1; i >= 0; --i) {
 		if(ss_isect->hits[i].t == t) {
 			return;
@@ -418,18 +135,19 @@ ccl_device_inline void triangle_intersect_subsurface(
 	isect->prim = prim_addr;
 	isect->object = object;
 	isect->type = PRIMITIVE_TRIANGLE;
-	isect->u = U * inv_det;
-	isect->v = V * inv_det;
+	isect->u = u;
+	isect->v = v;
 	isect->t = t;
 
 	/* Record geometric normal. */
-	/* TODO(sergey): Use float4_to_float3() on just an edges. */
-	const float3 v0 = float4_to_float3(tri_a);
-	const float3 v1 = float4_to_float3(tri_b);
-	const float3 v2 = float4_to_float3(tri_c);
-	ss_isect->Ng[hit] = normalize(cross(v1 - v0, v2 - v0));
-}
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+0)),
+	             tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+1)),
+	             tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex+2));
 #endif
+	ss_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+}
+#endif  /* __SUBSURFACE__ */
 
 /* Refine triangle intersection to more precise hit point. For rays that travel
  * far the precision is often not so good, this reintersects the primitive from
@@ -457,7 +175,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 			return P;
 		}
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #  else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
 #  endif
@@ -491,7 +209,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #  ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #  else
 		Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
 #  endif
@@ -519,7 +237,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_itfm);
+		Transform tfm = sd->ob_itfm;
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -557,7 +275,7 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 
 	if(isect->object != OBJECT_NONE) {
 #ifdef __OBJECT_MOTION__
-		Transform tfm = ccl_fetch(sd, ob_tfm);
+		Transform tfm = sd->ob_tfm;
 #else
 		Transform tfm = object_fetch_transform(kg,
 		                                       isect->object,
@@ -570,6 +288,4 @@ ccl_device_inline float3 triangle_refine_subsurface(KernelGlobals *kg,
 	return P;
 }
 
-#undef IDX
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 03724c955be..1e0ef5201c9 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -64,7 +64,7 @@ ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
 
 ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
 {
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
+	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_CUDA__
 #  if __CUDA_ARCH__ >= 300
 	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
@@ -91,7 +91,7 @@ ccl_device float volume_attribute_float(KernelGlobals *kg, const ShaderData *sd,
 
 ccl_device float3 volume_attribute_float3(KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float3 *dx, float3 *dy)
 {
-	float3 P = volume_normalized_position(kg, sd, ccl_fetch(sd, P));
+	float3 P = volume_normalized_position(kg, sd, sd->P);
 #ifdef __KERNEL_CUDA__
 #  if __CUDA_ARCH__ >= 300
 	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, desc.offset);
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 9279a94c13a..06c0fb2fbca 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -19,7 +19,8 @@
 
 /* CPU Kernel Interface */
 
-#include "util_types.h"
+#include "util/util_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -28,6 +29,7 @@ CCL_NAMESPACE_BEGIN
 #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
 
 struct KernelGlobals;
+struct KernelData;
 
 KernelGlobals *kernel_globals_create();
 void kernel_globals_free(KernelGlobals *kg);
@@ -46,31 +48,31 @@ void kernel_tex_copy(KernelGlobals *kg,
                      ExtensionType extension = EXTENSION_REPEAT);
 
 #define KERNEL_ARCH cpu
-#include "kernels/cpu/kernel_cpu.h"
+#include "kernel/kernels/cpu/kernel_cpu.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 #  define KERNEL_ARCH cpu_sse2
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 #  define KERNEL_ARCH cpu_sse3
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 #  define KERNEL_ARCH cpu_sse41
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 #  define KERNEL_ARCH cpu_avx
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 #  define KERNEL_ARCH cpu_avx2
-#  include "kernels/cpu/kernel_cpu.h"
+#  include "kernel/kernels/cpu/kernel_cpu.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 6c3ee6b8098..823d30dde78 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -52,10 +52,17 @@ ccl_device_inline void bsdf_eval_init(BsdfEval *eval, ClosureType type, float3 v
 	{
 		eval->diffuse = value;
 	}
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis = make_float3(0.0f, 0.0f, 0.0f);
+#endif
 }
 
-ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value)
+ccl_device_inline void bsdf_eval_accum(BsdfEval *eval, ClosureType type, float3 value, float mis_weight)
 {
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis += value;
+#endif
+	value *= mis_weight;
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
 		if(CLOSURE_IS_BSDF_DIFFUSE(type))
@@ -96,7 +103,7 @@ ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
 	}
 }
 
-ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
+ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -115,8 +122,19 @@ ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
 	}
 }
 
+ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
+{  /* Scale the evaluation by a scalar: sum_no_mis (when compiled in) plus whatever bsdf_eval_mis() scales. */
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis *= value;  /* bsdf_eval_mis() is not given this field's scaling, so apply it here */
+#endif
+	bsdf_eval_mis(eval, value);  /* scales the remaining fields of eval by the same factor */
+}
+
 ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
 {
+#ifdef __SHADOW_TRICKS__
+	eval->sum_no_mis *= value;
+#endif
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
 		eval->diffuse *= value;
@@ -134,7 +152,7 @@ ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
 #endif
 }
 
-ccl_device_inline float3 bsdf_eval_sum(BsdfEval *eval)
+ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
 {
 #ifdef __PASSES__
 	if(eval->use_light_pass) {
@@ -198,6 +216,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 	{
 		L->emission = make_float3(0.0f, 0.0f, 0.0f);
 	}
+
+#ifdef __SHADOW_TRICKS__
+	L->path_total = make_float3(0.0f, 0.0f, 0.0f);
+	L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
+	L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
+#endif
 }
 
 ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -252,7 +276,12 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
 	}
 }
 
-ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput, float3 alpha, float3 bsdf, float3 ao, int bounce)
+ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
+                                              float3 throughput,
+                                              float3 alpha,
+                                              float3 bsdf,
+                                              float3 ao,
+                                              int bounce)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
@@ -271,6 +300,26 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L, float3 throughput
 	{
 		L->emission += throughput*bsdf*ao;
 	}
+
+#ifdef __SHADOW_TRICKS__
+	float3 light = throughput * bsdf;
+	L->path_total += light;
+	L->path_total_shaded += ao * light;
+#endif
+}
+
+ccl_device_inline void path_radiance_accum_total_ao(
+        PathRadiance *L,
+        float3 throughput,
+        float3 bsdf)
+{
+#ifdef __SHADOW_TRICKS__
+	L->path_total += throughput * bsdf;
+#else
+	(void) L;
+	(void) throughput;
+	(void) bsdf;
+#endif
 }
 
 ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
@@ -301,15 +350,38 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
 	{
 		L->emission += throughput*bsdf_eval->diffuse*shadow;
 	}
+
+#ifdef __SHADOW_TRICKS__
+	float3 light = throughput * bsdf_eval->sum_no_mis;
+	L->path_total += light;
+	L->path_total_shaded += shadow * light;
+#endif
+}
+
+ccl_device_inline void path_radiance_accum_total_light(
+        PathRadiance *L,
+        float3 throughput,
+        const BsdfEval *bsdf_eval)
+{
+#ifdef __SHADOW_TRICKS__
+	L->path_total += throughput * bsdf_eval->sum_no_mis;
+#else
+	(void) L;
+	(void) throughput;
+	(void) bsdf_eval;
+#endif
 }
 
-ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 throughput, float3 value, int bounce)
+ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
+                                                      ccl_addr_space PathState *state,
+                                                      float3 throughput,
+                                                      float3 value)
 {
 #ifdef __PASSES__
 	if(L->use_light_pass) {
-		if(bounce == 0)
+		if(state->bounce == 0)
 			L->background += throughput*value;
-		else if(bounce == 1)
+		else if(state->bounce == 1)
 			L->direct_emission += throughput*value;
 		else
 			L->indirect += throughput*value;
@@ -319,6 +391,13 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L, float3 th
 	{
 		L->emission += throughput*value;
 	}
+
+#ifdef __SHADOW_TRICKS__
+	L->path_total += throughput * value;
+	if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
+		L->path_total_shaded += throughput * value;
+	}
+#endif
 }
 
 ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -399,7 +478,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 		float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
 
 		/* Reject invalid value */
-		if(!isfinite(sum)) {
+		if(!isfinite_safe(sum)) {
 			kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!");
 			L_sum = make_float3(0.0f, 0.0f, 0.0f);
 
@@ -468,7 +547,7 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
 
 	/* Reject invalid value */
 	float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
-	if(!isfinite(sum)) {
+	if(!isfinite_safe(sum)) {
 		kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
 		L_sum = make_float3(0.0f, 0.0f, 0.0f);
 	}
@@ -501,5 +580,34 @@ ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance
 	L->emission += L_sample->emission * fac;
 }
 
-CCL_NAMESPACE_END
+#ifdef __SHADOW_TRICKS__
+/* Calculate current shadow of the path. */
+ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
+{
+	float path_total = average(L->path_total);
+	float path_total_shaded = average(L->path_total_shaded);
+	if(path_total != 0.0f) {
+		return path_total_shaded / path_total;
+	}
+	return 1.0f;
+}
 
+/* Calculate final light sum and transparency for shadow catcher object. */
+ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
+                                                         const PathRadiance *L,
+                                                         ccl_addr_space float* L_transparent)
+{
+	const float shadow = path_radiance_sum_shadow(L);
+	float3 L_sum;
+	if(kernel_data.background.transparent) {
+		*L_transparent = shadow;
+		L_sum = make_float3(0.0f, 0.0f, 0.0f);
+	}
+	else {
+		L_sum = L->shadow_color * shadow;
+	}
+	return L_sum;
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 5bcc57cdcdf..f18d145f7cf 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -54,7 +54,8 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
 	float rbsdf = path_state_rng_1D(kg, &rng, &state, PRNG_BSDF);
 	shader_eval_surface(kg, sd, &rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
 
-	/* TODO, disable the closures we won't need */
+	/* TODO, disable more closures we don't need besides transparent */
+	shader_bsdf_disable_transparency(kg, sd);
 
 #ifdef __BRANCHED_PATH__
 	if(!kernel_data.integrator.branched) {
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index dedac6b1465..0df5217d97a 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -457,7 +457,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
 	if(kernel_data.cam.type != CAMERA_PANORAMA) {
 		/* perspective / ortho */
-		if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+		if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
 			P += camera_position(kg);
 
 		Transform tfm = kernel_data.cam.worldtondc;
@@ -467,7 +467,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 		/* panorama */
 		Transform tfm = kernel_data.cam.worldtocamera;
 
-		if(ccl_fetch(sd, object) != OBJECT_NONE)
+		if(sd->object != OBJECT_NONE)
 			P = normalize(transform_point(&tfm, P));
 		else
 			P = normalize(transform_direction(&tfm, P));
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/kernel_compat_cpu.h
index 9d1f3bdc918..ae7c9b836c4 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/kernel_compat_cpu.h
@@ -35,15 +35,24 @@
 #  define __NODES_FEATURES__ NODE_FEATURE_ALL
 #endif
 
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_simd.h"
-#include "util_half.h"
-#include "util_types.h"
-#include "util_texture.h"
+#include "util/util_debug.h"
+#include "util/util_math.h"
+#include "util/util_simd.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+#include "util/util_texture.h"
 
 #define ccl_addr_space
 
+#define ccl_local_id(d) 0
+#define ccl_global_id(d) (kg->global_id[d])
+
+#define ccl_local_size(d) 1
+#define ccl_global_size(d) (kg->global_size[d])
+
+#define ccl_group_id(d) ccl_global_id(d)
+#define ccl_num_groups(d) ccl_global_size(d)
+
 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
  * much slower than the double version.  This was fixed in glibc 2.16.
  */
@@ -78,9 +87,9 @@ template<typename T> struct texture  {
 	ccl_always_inline avxf fetch_avxf(const int index)
 	{
 		kernel_assert(index >= 0 && (index+1) < width);
-		ssef *ssefData = (ssef*)data;
-		ssef *ssefNodeData = &ssefData[index];
-		return _mm256_loadu_ps((float *)ssefNodeData);
+		ssef *ssef_data = (ssef*)data;
+		ssef *ssef_node_data = &ssef_data[index];
+		return _mm256_loadu_ps((float *)ssef_node_data);
 	}
 
 #endif
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index e0c7b17c6a0..39e98c7dda6 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -38,7 +38,7 @@
 
 #define ccl_device  __device__ __inline__
 #  define ccl_device_forceinline  __device__ __forceinline__
-#if (__KERNEL_CUDA_VERSION__ == 80) && (__CUDA_ARCH__ < 500)
+#if __CUDA_ARCH__ < 500
 #  define ccl_device_inline  __device__ __forceinline__
 #else
 #  define ccl_device_inline  __device__ __inline__
@@ -46,6 +46,9 @@
 #define ccl_device_noinline  __device__ __noinline__
 #define ccl_global
 #define ccl_constant
+#define ccl_local __shared__
+#define ccl_local_param
+#define ccl_private
 #define ccl_may_alias
 #define ccl_addr_space
 #define ccl_restrict __restrict__
@@ -57,8 +60,54 @@
 
 /* Types */
 
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
+
+/* Work item functions */
+
+ccl_device_inline uint ccl_local_id(uint d)
+{
+	switch(d) {
+		case 0: return threadIdx.x;
+		case 1: return threadIdx.y;
+		case 2: return threadIdx.z;
+		default: return 0;
+	}
+}
+
+#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+
+ccl_device_inline uint ccl_local_size(uint d)
+{
+	switch(d) {
+		case 0: return blockDim.x;
+		case 1: return blockDim.y;
+		case 2: return blockDim.z;
+		default: return 0;
+	}
+}
+
+#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+
+ccl_device_inline uint ccl_group_id(uint d)
+{
+	switch(d) {
+		case 0: return blockIdx.x;
+		case 1: return blockIdx.y;
+		case 2: return blockIdx.z;
+		default: return 0;
+	}
+}
+
+ccl_device_inline uint ccl_num_groups(uint d)
+{
+	switch(d) {
+		case 0: return gridDim.x;
+		case 1: return gridDim.y;
+		case 2: return gridDim.z;
+		default: return 0;
+	}
+}
 
 /* Textures */
 
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index f076e3a7d37..c2263ac0d49 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -39,6 +39,7 @@
 #define ccl_constant __constant
 #define ccl_global __global
 #define ccl_local __local
+#define ccl_local_param __local
 #define ccl_private __private
 #define ccl_restrict restrict
 #define ccl_align(n) __attribute__((aligned(n)))
@@ -49,6 +50,15 @@
 #  define ccl_addr_space
 #endif
 
+#define ccl_local_id(d) get_local_id(d)
+#define ccl_global_id(d) get_global_id(d)
+
+#define ccl_local_size(d) get_local_size(d)
+#define ccl_global_size(d) get_global_size(d)
+
+#define ccl_group_id(d) get_group_id(d)
+#define ccl_num_groups(d) get_num_groups(d)
+
 /* Selective nodes compilation. */
 #ifndef __NODES_MAX_GROUP__
 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
@@ -133,8 +143,8 @@
 /* define NULL */
 #define NULL 0
 
-#include "util_half.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_types.h"
 
 #endif /* __KERNEL_COMPAT_OPENCL_H__ */
 
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 8c7c651a053..9e7d51f23f5 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -67,7 +67,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		                         ls->shader, ls->object, ls->prim,
 		                         ls->u, ls->v, t, time, false, ls->lamp);
 
-		ls->Ng = ccl_fetch(emission_sd, Ng);
+		ls->Ng = emission_sd->Ng;
 
 		/* no path flag, we're evaluating this for all closures. that's weak but
 		 * we'd have to do multiple evaluations otherwise */
@@ -76,7 +76,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
 		path_state_modify_bounce(state, false);
 
 		/* evaluate emissive closure */
-		if(ccl_fetch(emission_sd, flag) & SD_EMISSION)
+		if(emission_sd->flag & SD_EMISSION)
 			eval = shader_emissive_eval(kg, emission_sd);
 		else
 			eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -112,7 +112,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	                                         -ls->D,
 	                                         dD,
 	                                         ls->t,
-	                                         ccl_fetch(sd, time));
+	                                         sd->time);
 
 	if(is_zero(light_eval))
 		return false;
@@ -120,7 +120,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	/* evaluate BSDF at shading point */
 
 #ifdef __VOLUME__
-	if(ccl_fetch(sd, prim) != PRIM_NONE)
+	if(sd->prim != PRIM_NONE)
 		shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
 	else {
 		float bsdf_pdf;
@@ -156,8 +156,13 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 	if(bsdf_eval_is_zero(eval))
 		return false;
 
-	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
-		float probability = max3(bsdf_eval_sum(eval)) * kernel_data.integrator.light_inv_rr_threshold;
+	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f
+#ifdef __SHADOW_TRICKS__
+	   && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0
+#endif
+	  )
+	{
+		float probability = max3(fabs(bsdf_eval_sum(eval))) * kernel_data.integrator.light_inv_rr_threshold;
 		if(probability < 1.0f) {
 			if(rand_terminate >= probability) {
 				return false;
@@ -168,8 +173,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 
 	if(ls->shader & SHADER_CAST_SHADOW) {
 		/* setup ray */
-		bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
-		ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+		bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
+		ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
 
 		if(ls->t == FLT_MAX) {
 			/* distant light */
@@ -182,7 +187,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg,
 			ray->D = normalize_len(ray->D, &ray->t);
 		}
 
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 		ray->dD = differential3_zero();
 	}
 	else {
@@ -204,14 +209,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 	float3 L = shader_emissive_eval(kg, sd);
 
 #ifdef __HAIR__
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
 #else
-	if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
+	if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
 #endif
 	{
 		/* multiple importance sampling, get triangle light pdf,
 		 * and compute weight with respect to BSDF pdf */
-		float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
+		float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
 		float mis_weight = power_heuristic(bsdf_pdf, pdf);
 
 		return L*mis_weight;
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 8e66a3a0340..c9c97ea977e 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -16,6 +16,9 @@
 
 /* Constant Globals */
 
+#ifndef __KERNEL_GLOBALS_H__
+#define __KERNEL_GLOBALS_H__
+
 CCL_NAMESPACE_BEGIN
 
 /* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
@@ -44,7 +47,7 @@ typedef struct KernelGlobals {
 
 #  define KERNEL_TEX(type, ttype, name) ttype name;
 #  define KERNEL_IMAGE_TEX(type, ttype, name)
-#  include "kernel_textures.h"
+#  include "kernel/kernel_textures.h"
 
 	KernelData __data;
 
@@ -64,6 +67,13 @@ typedef struct KernelGlobals {
 	/* Storage for decoupled volume steps. */
 	VolumeStep *decoupled_volume_steps[2];
 	int decoupled_volume_steps_index;
+
+	/* split kernel */
+	SplitData split_data;
+	SplitParams split_param_data;
+
+	int2 global_size;
+	int2 global_id;
 } KernelGlobals;
 
 #endif  /* __KERNEL_CPU__ */
@@ -76,7 +86,10 @@ typedef struct KernelGlobals {
 #ifdef __KERNEL_CUDA__
 
 __constant__ KernelData __data;
-typedef struct KernelGlobals {} KernelGlobals;
+typedef struct KernelGlobals {
+	/* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
+	Intersection hits_stack[64];
+} KernelGlobals;
 
 #  ifdef __KERNEL_CUDA_TEX_STORAGE__
 #    define KERNEL_TEX(type, ttype, name) ttype name;
@@ -84,7 +97,7 @@ typedef struct KernelGlobals {} KernelGlobals;
 #    define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
 #  endif
 #  define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
-#  include "kernel_textures.h"
+#  include "kernel/kernel_textures.h"
 
 #endif  /* __KERNEL_CUDA__ */
 
@@ -97,11 +110,11 @@ typedef ccl_addr_space struct KernelGlobals {
 
 #  define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name;
-#  include "kernel_textures.h"
+#  include "kernel/kernel_textures.h"
 
 #  ifdef __SPLIT_KERNEL__
-	ShaderData *sd_input;
-	Intersection *isect_shadow;
+	SplitData split_data;
+	SplitParams split_param_data;
 #  endif
 } KernelGlobals;
 
@@ -143,3 +156,4 @@ ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float y, int o
 
 CCL_NAMESPACE_END
 
+#endif  /* __KERNEL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 9bee5603474..bd0e23b7705 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -17,11 +17,11 @@
 #ifndef __KERNEL_MATH_H__
 #define __KERNEL_MATH_H__
 
-#include "util_color.h"
-#include "util_math.h"
-#include "util_math_fast.h"
-#include "util_texture.h"
-#include "util_transform.h"
+#include "util/util_color.h"
+#include "util/util_math.h"
+#include "util/util_math_fast.h"
+#include "util/util_math_intersect.h"
+#include "util/util_texture.h"
+#include "util/util_transform.h"
 
 #endif /* __KERNEL_MATH_H__ */
-
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 7aec47e4957..ed523696571 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -19,16 +19,16 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, int sample, float value)
 {
 	ccl_global float *buf = buffer;
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
 	atomic_add_and_fetch_float(buf, value);
 #else
 	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif  /* __SPLIT_KERNEL__ */
 }
 
 ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sample, float3 value)
 {
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -39,12 +39,12 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, int sa
 #else
 	ccl_global float3 *buf = (ccl_global float3*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif  /* __SPLIT_KERNEL__ */
 }
 
 ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sample, float4 value)
 {
-#if defined(__SPLIT_KERNEL__) && defined(__WORK_STEALING__)
+#if defined(__SPLIT_KERNEL__)
 	ccl_global float *buf_x = buffer + 0;
 	ccl_global float *buf_y = buffer + 1;
 	ccl_global float *buf_z = buffer + 2;
@@ -57,7 +57,7 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
 #else
 	ccl_global float4 *buf = (ccl_global float4*)buffer;
 	*buf = (sample == 0)? value: *buf + value;
-#endif // __SPLIT_KERNEL__ && __WORK_STEALING__
+#endif  /* __SPLIT_KERNEL__ */
 }
 
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
@@ -75,18 +75,18 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		return;
 	
 	if(!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-		if(!(ccl_fetch(sd, flag) & SD_TRANSPARENT) ||
+		if(!(sd->flag & SD_TRANSPARENT) ||
 		   kernel_data.film.pass_alpha_threshold == 0.0f ||
 		   average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold)
 		{
 
 			if(sample == 0) {
 				if(flag & PASS_DEPTH) {
-					float depth = camera_distance(kg, ccl_fetch(sd, P));
+					float depth = camera_distance(kg, sd->P);
 					kernel_write_pass_float(buffer + kernel_data.film.pass_depth, sample, depth);
 				}
 				if(flag & PASS_OBJECT_ID) {
-					float id = object_pass_id(kg, ccl_fetch(sd, object));
+					float id = object_pass_id(kg, sd->object);
 					kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, sample, id);
 				}
 				if(flag & PASS_MATERIAL_ID) {
@@ -96,7 +96,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 			}
 
 			if(flag & PASS_NORMAL) {
-				float3 normal = ccl_fetch(sd, N);
+				float3 normal = sd->N;
 				kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, sample, normal);
 			}
 			if(flag & PASS_UV) {
@@ -127,7 +127,7 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		float mist_start = kernel_data.film.mist_start;
 		float mist_inv_depth = kernel_data.film.mist_inv_depth;
 
-		float depth = camera_distance(kg, ccl_fetch(sd, P));
+		float depth = camera_distance(kg, sd->P);
 		float mist = saturate((depth - mist_start)*mist_inv_depth);
 
 		/* falloff */
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index f90701a8260..e7957042182 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -15,40 +15,41 @@
  */
 
 #ifdef __OSL__
-#  include "osl_shader.h"
+#  include "kernel/osl/osl_shader.h"
 #endif
 
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_camera.h"
 
-#include "geom/geom.h"
-#include "bvh/bvh.h"
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
 
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
 
 #ifdef __SUBSURFACE__
-#  include "kernel_subsurface.h"
+#  include "kernel/kernel_subsurface.h"
 #endif
 
 #ifdef __VOLUME__
-#  include "kernel_volume.h"
+#  include "kernel/kernel_volume.h"
 #endif
 
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shadow.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_path_common.h"
+#include "kernel/kernel_path_surface.h"
+#include "kernel/kernel_path_volume.h"
+#include "kernel/kernel_path_subsurface.h"
 
 #ifdef __KERNEL_DEBUG__
-#  include "kernel_debug.h"
+#  include "kernel/kernel_debug.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
@@ -75,22 +76,25 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
 
 	sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-	if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+	if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 		Ray light_ray;
 		float3 ao_shadow;
 
-		light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+		light_ray.P = ray_offset(sd->P, sd->Ng);
 		light_ray.D = ao_D;
 		light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-		light_ray.time = ccl_fetch(sd, time);
+		light_ray.time = sd->time;
 #endif  /* __OBJECT_MOTION__ */
-		light_ray.dP = ccl_fetch(sd, dP);
+		light_ray.dP = sd->dP;
 		light_ray.dD = differential3_zero();
 
 		if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
 			path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
 		}
+		else {
+			path_radiance_accum_total_ao(L, throughput, ao_bsdf);
+		}
 	}
 }
 
@@ -289,9 +293,9 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, emission_sd, state, ray);
 			path_radiance_accum_background(L,
+			                               state,
 			                               throughput,
-			                               L_background,
-			                               state->bounce);
+			                               L_background);
 #endif  /* __BACKGROUND__ */
 
 			break;
@@ -311,6 +315,12 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 		shader_merge_closures(sd);
 #endif  /* __BRANCHED_PATH__ */
 
+#ifdef __SHADOW_TRICKS__
+		if(!(sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+			state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
+		}
+#endif  /* __SHADOW_TRICKS__ */
+
 		/* blurring of bsdf after bounces, for rays that have a small likelihood
 		 * of following this particular path (diffuse, rough glossy) */
 		if(kernel_data.integrator.filter_glossy != FLT_MAX) {
@@ -373,7 +383,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 
 			/* do bssrdf scatter step if we picked a bssrdf closure */
 			if(sc) {
-				uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+				uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
 
 				float bssrdf_u, bssrdf_v;
 				path_state_rng_2D(kg,
@@ -395,7 +405,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 
 #if defined(__EMISSION__) && defined(__BRANCHED_PATH__)
 		if(kernel_data.integrator.use_direct_light) {
-			int all = kernel_data.integrator.sample_all_lights_indirect;
+			int all = (kernel_data.integrator.sample_all_lights_indirect) ||
+			          (state->flag & PATH_RAY_SHADOW_CATCHER);
 			kernel_branched_path_surface_connect_light(kg,
 			                                           rng,
 			                                           sd,
@@ -413,172 +424,6 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
 	}
 }
 
-#ifdef __SUBSURFACE__
-#  ifndef __KERNEL_CUDA__
-ccl_device
-#  else
-ccl_device_inline
-#  endif
-bool kernel_path_subsurface_scatter(
-        KernelGlobals *kg,
-        ShaderData *sd,
-        ShaderData *emission_sd,
-        PathRadiance *L,
-        PathState *state,
-        RNG *rng,
-        Ray *ray,
-        float3 *throughput,
-        SubsurfaceIndirectRays *ss_indirect)
-{
-	float bssrdf_probability;
-	ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
-
-	/* modify throughput for picking bssrdf or bsdf */
-	*throughput *= bssrdf_probability;
-
-	/* do bssrdf scatter step if we picked a bssrdf closure */
-	if(sc) {
-		/* We should never have two consecutive BSSRDF bounces,
-		 * the second one should be converted to a diffuse BSDF to
-		 * avoid this.
-		 */
-		kernel_assert(!ss_indirect->tracing);
-
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
-
-		SubsurfaceIntersection ss_isect;
-		float bssrdf_u, bssrdf_v;
-		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-		int num_hits = subsurface_scatter_multi_intersect(kg,
-		                                                  &ss_isect,
-		                                                  sd,
-		                                                  sc,
-		                                                  &lcg_state,
-		                                                  bssrdf_u, bssrdf_v,
-		                                                  false);
-#  ifdef __VOLUME__
-		ss_indirect->need_update_volume_stack =
-		        kernel_data.integrator.use_volumes &&
-		        ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME;
-#  endif  /* __VOLUME__ */
-
-		/* compute lighting with the BSDF closure */
-		for(int hit = 0; hit < num_hits; hit++) {
-			/* NOTE: We reuse the existing ShaderData, we assume the path
-			 * integration loop stops when this function returns true.
-			 */
-			subsurface_scatter_multi_setup(kg,
-			                               &ss_isect,
-			                               hit,
-			                               sd,
-			                               state,
-			                               state->flag,
-			                               sc,
-			                               false);
-
-			PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
-			Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
-			float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
-			PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays];
-
-			*hit_state = *state;
-			*hit_ray = *ray;
-			*hit_tp = *throughput;
-
-			hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
-			path_radiance_init(hit_L, kernel_data.film.use_light_pass);
-			hit_L->direct_throughput = L->direct_throughput;
-			path_radiance_copy_indirect(hit_L, L);
-
-			kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);
-
-			if(kernel_path_surface_bounce(kg,
-			                              rng,
-			                              sd,
-			                              hit_tp,
-			                              hit_state,
-			                              hit_L,
-			                              hit_ray))
-			{
-#  ifdef __LAMP_MIS__
-				hit_state->ray_t = 0.0f;
-#  endif  /* __LAMP_MIS__ */
-
-#  ifdef __VOLUME__
-				if(ss_indirect->need_update_volume_stack) {
-					Ray volume_ray = *ray;
-					/* Setup ray from previous surface point to the new one. */
-					volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
-					                             &volume_ray.t);
-
-					kernel_volume_stack_update_for_subsurface(
-					    kg,
-					    emission_sd,
-					    &volume_ray,
-					    hit_state->volume_stack);
-				}
-#  endif  /* __VOLUME__ */
-				path_radiance_reset_indirect(L);
-				ss_indirect->num_rays++;
-			}
-			else {
-				path_radiance_accum_sample(L, hit_L, 1);
-			}
-		}
-		return true;
-	}
-	return false;
-}
-
-ccl_device_inline void kernel_path_subsurface_init_indirect(
-        SubsurfaceIndirectRays *ss_indirect)
-{
-	ss_indirect->tracing = false;
-	ss_indirect->num_rays = 0;
-}
-
-ccl_device void kernel_path_subsurface_accum_indirect(
-        SubsurfaceIndirectRays *ss_indirect,
-        PathRadiance *L)
-{
-	if(ss_indirect->tracing) {
-		path_radiance_sum_indirect(L);
-		path_radiance_accum_sample(&ss_indirect->direct_L, L, 1);
-		if(ss_indirect->num_rays == 0) {
-			*L = ss_indirect->direct_L;
-		}
-	}
-}
-
-ccl_device void kernel_path_subsurface_setup_indirect(
-        KernelGlobals *kg,
-        SubsurfaceIndirectRays *ss_indirect,
-        PathState *state,
-        Ray *ray,
-        PathRadiance *L,
-        float3 *throughput)
-{
-	if(!ss_indirect->tracing) {
-		ss_indirect->direct_L = *L;
-	}
-	ss_indirect->tracing = true;
-
-	/* Setup state, ray and throughput for indirect SSS rays. */
-	ss_indirect->num_rays--;
-
-	Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
-	PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays];
-
-	*state = ss_indirect->state[ss_indirect->num_rays];
-	*ray = *indirect_ray;
-	*L = *indirect_L;
-	*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
-
-	state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
-}
-
-#endif  /* __SUBSURFACE__ */
 
 ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
                                                RNG *rng,
@@ -631,7 +476,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 			}
 
 			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
+			lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d);
 		}
 
 		if(state.bounce > kernel_data.integrator.ao_bounces) {
@@ -776,7 +621,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 #ifdef __BACKGROUND__
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
+			path_radiance_accum_background(&L, &state, throughput, L_background);
 #endif  /* __BACKGROUND__ */
 
 			break;
@@ -790,6 +635,21 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 		float rbsdf = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_BSDF);
 		shader_eval_surface(kg, &sd, rng, &state, rbsdf, state.flag, SHADER_CONTEXT_MAIN);
 
+#ifdef __SHADOW_TRICKS__
+		if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+			if(state.flag & PATH_RAY_CAMERA) {
+				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+				state.catcher_object = sd.object;
+				if(!kernel_data.background.transparent) {
+					L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+				}
+			}
+		}
+		else {
+			state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
+		}
+#endif  /* __SHADOW_TRICKS__ */
+
 		/* holdout */
 #ifdef __HOLDOUT__
 		if(((sd.flag & SD_HOLDOUT) ||
@@ -907,7 +767,16 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
 	}
 #endif  /* __SUBSURFACE__ */
 
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
+	float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+	if(state.flag & PATH_RAY_SHADOW_CATCHER) {
+		L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
+	}
+	else
+#endif  /* __SHADOW_TRICKS__ */
+	{
+		L_sum = path_radiance_clamp_and_sum(kg, &L);
+	}
 
 	kernel_write_light_passes(kg, buffer, &L, sample);
 
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index ff2b828795d..36fd6c95fe7 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -42,21 +42,25 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
 
 		sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-		if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+		if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 			Ray light_ray;
 			float3 ao_shadow;
 
-			light_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+			light_ray.P = ray_offset(sd->P, sd->Ng);
 			light_ray.D = ao_D;
 			light_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-			light_ray.time = ccl_fetch(sd, time);
+			light_ray.time = sd->time;
 #endif  /* __OBJECT_MOTION__ */
-			light_ray.dP = ccl_fetch(sd, dP);
+			light_ray.dP = sd->dP;
 			light_ray.dD = differential3_zero();
 
-			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow))
+			if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
 				path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
+			}
+			else {
+				path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
+			}
 		}
 	}
 }
@@ -67,8 +71,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
 	RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
 	float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
 		if(!CLOSURE_IS_BSDF(sc->type))
 			continue;
@@ -140,14 +144,14 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
                                                         Ray *ray,
                                                         float3 throughput)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = &ccl_fetch(sd, closure)[i];
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(!CLOSURE_IS_BSSRDF(sc->type))
 			continue;
 
 		/* set up random number generator */
-		uint lcg_state = lcg_state_init(rng, state, 0x68bc21eb);
+		uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
 		int num_samples = kernel_data.integrator.subsurface_samples;
 		float num_samples_inv = 1.0f/num_samples;
 		RNG bssrdf_rng = cmj_hash(*rng, i);
@@ -169,7 +173,7 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 			Ray volume_ray = *ray;
 			bool need_update_volume_stack =
 			        kernel_data.integrator.use_volumes &&
-			        ccl_fetch(sd, object_flag) & SD_OBJECT_INTERSECTS_VOLUME;
+			        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
 #endif  /* __VOLUME__ */
 
 			/* compute lighting with the BSDF closure */
@@ -206,7 +210,8 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
 #ifdef __EMISSION__
 				/* direct light */
 				if(kernel_data.integrator.use_direct_light) {
-					int all = kernel_data.integrator.sample_all_lights_direct;
+					int all = (kernel_data.integrator.sample_all_lights_direct) ||
+					          (state->flag & PATH_RAY_SHADOW_CATCHER);
 					kernel_branched_path_surface_connect_light(
 					        kg,
 					        rng,
@@ -280,7 +285,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 			}
 
 			extmax = kernel_data.curve.maximum_width;
-			lcg_state = lcg_state_init(rng, &state, 0x51633e2d);
+			lcg_state = lcg_state_init(rng, state.rng_offset, state.sample, 0x51633e2d);
 		}
 
 		bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
@@ -461,7 +466,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __BACKGROUND__
 			/* sample background shader */
 			float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
-			path_radiance_accum_background(&L, throughput, L_background, state.bounce);
+			path_radiance_accum_background(&L, &state, throughput, L_background);
 #endif  /* __BACKGROUND__ */
 
 			break;
@@ -472,6 +477,21 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 		shader_eval_surface(kg, &sd, rng, &state, 0.0f, state.flag, SHADER_CONTEXT_MAIN);
 		shader_merge_closures(&sd);
 
+#ifdef __SHADOW_TRICKS__
+		if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+			if(state.flag & PATH_RAY_CAMERA) {
+				state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+				state.catcher_object = sd.object;
+				if(!kernel_data.background.transparent) {
+					L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
+				}
+			}
+		}
+		else {
+			state.flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
+		}
+#endif  /* __SHADOW_TRICKS__ */
+
 		/* holdout */
 #ifdef __HOLDOUT__
 		if((sd.flag & SD_HOLDOUT) || (sd.object_flag & SD_OBJECT_HOLDOUT_MASK)) {
@@ -544,7 +564,8 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #ifdef __EMISSION__
 			/* direct light */
 			if(kernel_data.integrator.use_direct_light) {
-				int all = kernel_data.integrator.sample_all_lights_direct;
+				int all = (kernel_data.integrator.sample_all_lights_direct) ||
+				          (state.flag & PATH_RAY_SHADOW_CATCHER);
 				kernel_branched_path_surface_connect_light(kg, rng,
 					&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
 			}
@@ -581,7 +602,16 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
 #endif  /* __VOLUME__ */
 	}
 
-	float3 L_sum = path_radiance_clamp_and_sum(kg, &L);
+	float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+	if(state.flag & PATH_RAY_SHADOW_CATCHER) {
+		L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
+	}
+	else
+#endif  /* __SHADOW_TRICKS__ */
+	{
+		L_sum = path_radiance_clamp_and_sum(kg, &L);
+	}
 
 	kernel_write_light_passes(kg, buffer, &L, sample);
 
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
index 7b903556bf9..82f83deb595 100644
--- a/intern/cycles/kernel/kernel_path_common.h
+++ b/intern/cycles/kernel/kernel_path_common.h
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "util_hash.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -22,7 +22,7 @@ ccl_device_inline void kernel_path_trace_setup(KernelGlobals *kg,
                                                ccl_global uint *rng_state,
                                                int sample,
                                                int x, int y,
-                                               ccl_addr_space RNG *rng,
+                                               RNG *rng,
                                                ccl_addr_space Ray *ray)
 {
 	float filter_u;
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index 661dc52fb31..c0cd2a63120 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void path_state_init(KernelGlobals *kg,
                                        ShaderData *stack_sd,
                                        ccl_addr_space PathState *state,
-                                       ccl_addr_space RNG *rng,
+                                       RNG *rng,
                                        int sample,
                                        ccl_addr_space Ray *ray)
 {
@@ -54,6 +54,10 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
 		state->volume_stack[0].shader = SHADER_NONE;
 	}
 #endif
+
+#ifdef __SHADOW_TRICKS__
+	state->catcher_object = OBJECT_NONE;
+#endif
 }
 
 ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathState *state, int label)
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
new file mode 100644
index 00000000000..10b568ac3dd
--- /dev/null
+++ b/intern/cycles/kernel/kernel_path_subsurface.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __SUBSURFACE__
+#  ifndef __KERNEL_CUDA__  /* plain ccl_device unless building for CUDA */
+ccl_device
+#  else
+ccl_device_inline
+#  endif  /* __KERNEL_CUDA__ */
+bool kernel_path_subsurface_scatter(
+        KernelGlobals *kg,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        PathRadiance *L,
+        ccl_addr_space PathState *state,
+        RNG *rng,
+        ccl_addr_space Ray *ray,
+        ccl_addr_space float3 *throughput,
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
+{
+	float bssrdf_probability;
+	ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+
+	/* Weight the throughput by the factor returned for the BSSRDF/BSDF pick. */
+	*throughput *= bssrdf_probability;
+
+	/* Scatter only when a BSSRDF closure was picked (sc is non-NULL). */
+	if(sc) {
+		/* We should never have two consecutive BSSRDF bounces,
+		 * the second one should be converted to a diffuse BSDF to
+		 * avoid this.
+		 */
+		kernel_assert(!ss_indirect->tracing);
+
+		uint lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0x68bc21eb);
+
+		SubsurfaceIntersection ss_isect;
+		float bssrdf_u, bssrdf_v;
+		path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+		int num_hits = subsurface_scatter_multi_intersect(kg,
+		                                                  &ss_isect,
+		                                                  sd,
+		                                                  sc,
+		                                                  &lcg_state,
+		                                                  bssrdf_u, bssrdf_v,
+		                                                  false);
+#  ifdef __VOLUME__
+		ss_indirect->need_update_volume_stack =
+		        kernel_data.integrator.use_volumes &&
+		        sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
+#  endif  /* __VOLUME__ */
+
+		/* For each hit: set up shading, connect to lights and sample a bounce. */
+		for(int hit = 0; hit < num_hits; hit++) {
+			/* NOTE: We reuse the existing ShaderData, we assume the path
+			 * integration loop stops when this function returns true.
+			 */
+			subsurface_scatter_multi_setup(kg,
+			                               &ss_isect,
+			                               hit,
+			                               sd,
+			                               state,
+			                               state->flag,
+			                               sc,
+			                               false);
+
+			ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
+			ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
+			ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
+			PathRadiance *hit_L = &ss_indirect->L[ss_indirect->num_rays];
+
+			*hit_state = *state;
+			*hit_ray = *ray;
+			*hit_tp = *throughput;
+
+			hit_state->rng_offset += PRNG_BOUNCE_NUM;
+
+			path_radiance_init(hit_L, kernel_data.film.use_light_pass);
+			hit_L->direct_throughput = L->direct_throughput;
+			path_radiance_copy_indirect(hit_L, L);
+
+			kernel_path_surface_connect_light(kg, rng, sd, emission_sd, *hit_tp, state, hit_L);
+
+			if(kernel_path_surface_bounce(kg,
+			                              rng,
+			                              sd,
+			                              hit_tp,
+			                              hit_state,
+			                              hit_L,
+			                              hit_ray))
+			{
+#  ifdef __LAMP_MIS__
+				hit_state->ray_t = 0.0f;
+#  endif  /* __LAMP_MIS__ */
+
+#  ifdef __VOLUME__
+				if(ss_indirect->need_update_volume_stack) {
+					Ray volume_ray = *ray;
+					/* Setup ray from previous surface point to the new one. */
+					volume_ray.D = normalize_len(hit_ray->P - volume_ray.P,
+					                             &volume_ray.t);
+
+					kernel_volume_stack_update_for_subsurface(
+					    kg,
+					    emission_sd,
+					    &volume_ray,
+					    hit_state->volume_stack);
+				}
+#  endif  /* __VOLUME__ */
+				path_radiance_reset_indirect(L);
+				ss_indirect->num_rays++;  /* keep this slot: the bounce produced an indirect ray */
+			}
+			else {
+				path_radiance_accum_sample(L, hit_L, 1);  /* no indirect ray was produced for this hit */
+			}
+		}
+		return true;  /* a BSSRDF closure was picked and processed */
+	}
+	return false;  /* no BSSRDF closure was picked */
+}
+/* Reset the indirect SSS ray storage: no rays stored and not tracing them yet. */
+ccl_device_inline void kernel_path_subsurface_init_indirect(
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
+{
+	ss_indirect->tracing = false;
+	ss_indirect->num_rays = 0;
+}
+/* While tracing stored indirect SSS rays, feed L into direct_L; copy it back to L after the last ray. */
+ccl_device void kernel_path_subsurface_accum_indirect(
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
+        PathRadiance *L)
+{
+	if(ss_indirect->tracing) {
+		path_radiance_sum_indirect(L);
+		path_radiance_accum_sample(&ss_indirect->direct_L, L, 1);
+		if(ss_indirect->num_rays == 0) {  /* no stored rays left */
+			*L = ss_indirect->direct_L;
+		}
+	}
+}
+/* Load the most recently stored indirect SSS ray into state/ray/L/throughput and mark tracing as started. */
+ccl_device void kernel_path_subsurface_setup_indirect(
+        KernelGlobals *kg,
+        ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
+        ccl_addr_space PathState *state,
+        ccl_addr_space Ray *ray,
+        PathRadiance *L,
+        ccl_addr_space float3 *throughput)
+{
+	if(!ss_indirect->tracing) {  /* first stored ray: remember the radiance gathered so far */
+		ss_indirect->direct_L = *L;
+	}
+	ss_indirect->tracing = true;
+
+	/* Take the last stored entry and restore its state, ray, radiance and throughput. */
+	ss_indirect->num_rays--;
+
+	ccl_addr_space Ray *indirect_ray = &ss_indirect->rays[ss_indirect->num_rays];
+	PathRadiance *indirect_L = &ss_indirect->L[ss_indirect->num_rays];
+
+	*state = ss_indirect->state[ss_indirect->num_rays];
+	*ray = *indirect_ray;
+	*L = *indirect_L;
+	*throughput = ss_indirect->throughputs[ss_indirect->num_rays];
+
+	state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;  /* presumably decorrelates the stored rays; confirm */
+}
+
+#endif  /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
index fea503d06e5..076c82f3853 100644
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ b/intern/cycles/kernel/kernel_path_surface.h
@@ -16,16 +16,22 @@
 
 CCL_NAMESPACE_BEGIN
 
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__)
-
+#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__)
 /* branched path tracing: connect path directly to position on one or more lights and add it to L */
-ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, ShaderData *emission_sd, PathState *state, float3 throughput,
-	float num_samples_adjust, PathRadiance *L, int sample_all_lights)
+ccl_device_noinline void kernel_branched_path_surface_connect_light(
+        KernelGlobals *kg,
+        RNG *rng,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        ccl_addr_space PathState *state,
+        float3 throughput,
+        float num_samples_adjust,
+        PathRadiance *L,
+        int sample_all_lights)
 {
 #ifdef __EMISSION__
 	/* sample illumination from lights to find path contribution */
-	if(!(ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))
+	if(!(sd->flag & SD_BSDF_HAS_EVAL))
 		return;
 
 	Ray light_ray;
@@ -33,7 +39,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 	bool is_lamp;
 
 #  ifdef __OBJECT_MOTION__
-	light_ray.time = ccl_fetch(sd, time);
+	light_ray.time = sd->time;
 #  endif
 
 	if(sample_all_lights) {
@@ -52,7 +58,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 				float terminate = path_branched_rng_light_termination(kg, &lamp_rng, state, j, num_samples);
 
 				LightSample ls;
-				if(lamp_light_sample(kg, i, light_u, light_v, ccl_fetch(sd, P), &ls)) {
+				if(lamp_light_sample(kg, i, light_u, light_v, sd->P, &ls)) {
 					/* The sampling probability returned by lamp_light_sample assumes that all lights were sampled.
 					 * However, this code only samples lamps, so if the scene also had mesh lights, the real probability is twice as high. */
 					if(kernel_data.integrator.pdf_triangles != 0.0f)
@@ -66,6 +72,9 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 							/* accumulate */
 							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 						}
+						else {
+							path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+						}
 					}
 				}
 			}
@@ -87,7 +96,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 					light_t = 0.5f*light_t;
 
 				LightSample ls;
-				if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+				if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 					/* Same as above, probability needs to be corrected since the sampling was forced to select a mesh light. */
 					if(kernel_data.integrator.num_all_lights)
 						ls.pdf *= 2.0f;
@@ -100,6 +109,9 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 							/* accumulate */
 							path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
 						}
+						else {
+							path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
+						}
 					}
 				}
 			}
@@ -113,7 +125,7 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 		float terminate = path_state_rng_light_termination(kg, rng, state);
 
 		LightSample ls;
-		if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+		if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 			/* sample random light */
 			if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 				/* trace shadow ray */
@@ -123,6 +135,9 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 					/* accumulate */
 					path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
 				}
+				else {
+					path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
+				}
 			}
 		}
 	}
@@ -130,9 +145,17 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(KernelGlobal
 }
 
 /* branched path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, const ShaderClosure *sc, int sample, int num_samples,
-	float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+ccl_device bool kernel_branched_path_surface_bounce(
+        KernelGlobals *kg,
+        RNG *rng,
+        ShaderData *sd,
+        const ShaderClosure *sc,
+        int sample,
+        int num_samples,
+        ccl_addr_space float3 *throughput,
+        ccl_addr_space PathState *state,
+        PathRadiance *L,
+        Ray *ray)
 {
 	/* sample BSDF */
 	float bsdf_pdf;
@@ -156,15 +179,15 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 	path_state_next(kg, state, label);
 
 	/* setup ray */
-	ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+	ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
 	ray->D = normalize(bsdf_omega_in);
 	ray->t = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-	ray->dP = ccl_fetch(sd, dP);
+	ray->dP = sd->dP;
 	ray->dD = bsdf_domega_in;
 #endif
 #ifdef __OBJECT_MOTION__
-	ray->time = ccl_fetch(sd, time);
+	ray->time = sd->time;
 #endif
 
 #ifdef __VOLUME__
@@ -188,15 +211,29 @@ ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg, RNG *rng,
 
 #endif
 
-#ifndef __SPLIT_KERNEL__
 /* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_addr_space RNG *rng,
+ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG *rng,
 	ShaderData *sd, ShaderData *emission_sd, float3 throughput, ccl_addr_space PathState *state,
 	PathRadiance *L)
 {
 #ifdef __EMISSION__
-	if(!(kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
+	if(!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)))
+		return;
+
+#ifdef __SHADOW_TRICKS__
+	if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+		kernel_branched_path_surface_connect_light(kg,
+		                                           rng,
+		                                           sd,
+		                                           emission_sd,
+		                                           state,
+		                                           throughput,
+		                                           1.0f,
+		                                           L,
+		                                           1);
 		return;
+	}
+#endif
 
 	/* sample illumination from lights to find path contribution */
 	float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
@@ -208,11 +245,11 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
 	bool is_lamp;
 
 #ifdef __OBJECT_MOTION__
-	light_ray.time = ccl_fetch(sd, time);
+	light_ray.time = sd->time;
 #endif
 
 	LightSample ls;
-	if(light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls)) {
+	if(light_sample(kg, light_t, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
 		float terminate = path_state_rng_light_termination(kg, rng, state);
 		if(direct_emission(kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 			/* trace shadow ray */
@@ -222,15 +259,17 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, ccl_
 				/* accumulate */
 				path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
 			}
+			else {
+				path_radiance_accum_total_light(L, throughput, &L_light);
+			}
 		}
 	}
 #endif
 }
-#endif
 
 /* path tracing: bounce off or through surface to with new direction stored in ray */
 ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
-                                           ccl_addr_space RNG *rng,
+                                           RNG *rng,
                                            ShaderData *sd,
                                            ccl_addr_space float3 *throughput,
                                            ccl_addr_space PathState *state,
@@ -238,7 +277,7 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
                                            ccl_addr_space Ray *ray)
 {
 	/* no BSDF? we can stop here */
-	if(ccl_fetch(sd, flag) & SD_BSDF) {
+	if(sd->flag & SD_BSDF) {
 		/* sample BSDF */
 		float bsdf_pdf;
 		BsdfEval bsdf_eval;
@@ -270,16 +309,16 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 		path_state_next(kg, state, label);
 
 		/* setup ray */
-		ray->P = ray_offset(ccl_fetch(sd, P), (label & LABEL_TRANSMIT)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
+		ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT)? -sd->Ng: sd->Ng);
 		ray->D = normalize(bsdf_omega_in);
 
 		if(state->bounce == 0)
-			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+			ray->t -= sd->ray_length; /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 		ray->dD = bsdf_domega_in;
 #endif
 
@@ -291,21 +330,21 @@ ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
 		return true;
 	}
 #ifdef __VOLUME__
-	else if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME) {
+	else if(sd->flag & SD_HAS_ONLY_VOLUME) {
 		/* no surface shader but have a volume shader? act transparent */
 
 		/* update path state, count as transparent */
 		path_state_next(kg, state, LABEL_TRANSPARENT);
 
 		if(state->bounce == 0)
-			ray->t -= ccl_fetch(sd, ray_length); /* clipping works through transparent */
+			ray->t -= sd->ray_length; /* clipping works through transparent */
 		else
 			ray->t = FLT_MAX;
 
 		/* setup ray position, direction stays unchanged */
-		ray->P = ray_offset(ccl_fetch(sd, P), -ccl_fetch(sd, Ng));
+		ray->P = ray_offset(sd->P, -sd->Ng);
 #ifdef __RAY_DIFFERENTIALS__
-		ray->dP = ccl_fetch(sd, dP);
+		ray->dP = sd->dP;
 #endif
 
 		/* enter/exit volume */
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index 3d3b7385d8b..371f2c1c7cb 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -24,7 +24,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
         ShaderData *sd,
         ShaderData *emission_sd,
         float3 throughput,
-        PathState *state,
+        ccl_addr_space PathState *state,
         PathRadiance *L)
 {
 #ifdef __EMISSION__
@@ -59,7 +59,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
 			}
 		}
 	}
-#endif
+#endif /* __EMISSION__ */
 }
 
 #ifdef __KERNEL_GPU__
@@ -67,8 +67,14 @@ ccl_device_noinline
 #else
 ccl_device
 #endif
-bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, float3 *throughput, PathState *state, PathRadiance *L, Ray *ray)
+bool kernel_path_volume_bounce(
+    KernelGlobals *kg,
+    RNG *rng,
+    ShaderData *sd,
+    ccl_addr_space float3 *throughput,
+    ccl_addr_space PathState *state,
+    PathRadiance *L,
+    ccl_addr_space Ray *ray)
 {
 	/* sample phase function */
 	float phase_pdf;
@@ -111,9 +117,18 @@ bool kernel_path_volume_bounce(KernelGlobals *kg, RNG *rng,
 	return true;
 }
 
-ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG *rng,
-	ShaderData *sd, ShaderData *emission_sd, float3 throughput, PathState *state, PathRadiance *L,
-	bool sample_all_lights, Ray *ray, const VolumeSegment *segment)
+#ifndef __SPLIT_KERNEL__
+ccl_device void kernel_branched_path_volume_connect_light(
+        KernelGlobals *kg,
+        RNG *rng,
+        ShaderData *sd,
+        ShaderData *emission_sd,
+        float3 throughput,
+        ccl_addr_space PathState *state,
+        PathRadiance *L,
+        bool sample_all_lights,
+        Ray *ray,
+        const VolumeSegment *segment)
 {
 #ifdef __EMISSION__
 	if(!kernel_data.integrator.use_direct_light)
@@ -261,10 +276,11 @@ ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg, RNG
 			}
 		}
 	}
-#endif
+#endif /* __EMISSION__ */
 }
+#endif /* __SPLIT_KERNEL__ */
 
-#endif
+#endif /* __VOLUME_SCATTER__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
index cf5614b8a86..96bc636d5ac 100644
--- a/intern/cycles/kernel/kernel_queues.h
+++ b/intern/cycles/kernel/kernel_queues.h
@@ -17,12 +17,15 @@
 #ifndef __KERNEL_QUEUE_H__
 #define __KERNEL_QUEUE_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Queue utility functions for split kernel
  */
-
+#ifdef __KERNEL_OPENCL__
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#endif
 
 /*
  * Enqueue ray index into the queue
@@ -35,7 +38,8 @@ ccl_device void enqueue_ray_index(
         ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
 {
 	/* This thread's queue index. */
-	int my_queue_index = atomic_inc(&queue_index[queue_number]) + (queue_number * queue_size);
+	int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint*)&queue_index[queue_number])
+	                   + (queue_number * queue_size);
 	queues[my_queue_index] = ray_index;
 }
 
@@ -47,6 +51,7 @@ ccl_device void enqueue_ray_index(
  * is no more ray to allocate to other threads.
  */
 ccl_device int get_ray_index(
+        KernelGlobals *kg,
         int thread_index,       /* Global thread index. */
         int queue_number,       /* Queue to operate on. */
         ccl_global int *queues, /* Buffer of all queues. */
@@ -68,24 +73,25 @@ ccl_device void enqueue_ray_index_local(
         int queue_number,                            /* Queue in which to enqueue ray index. */
         char enqueue_flag,                           /* True for threads whose ray index has to be enqueued. */
         int queuesize,                               /* queue size. */
-        ccl_local unsigned int *local_queue_atomics,   /* To to local queue atomics. */
+        ccl_local_param unsigned int *local_queue_atomics,   /* To do local queue atomics. */
         ccl_global int *Queue_data,                  /* Queues. */
         ccl_global int *Queue_index)                 /* To do global queue atomics. */
 {
-	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
 
 	/* Get local queue id .*/
 	unsigned int lqidx;
 	if(enqueue_flag) {
-		lqidx = atomic_inc(local_queue_atomics);
+		lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics);
 	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
 	/* Get global queue offset. */
 	if(lidx == 0) {
-		*local_queue_atomics = atomic_add(&Queue_index[queue_number], *local_queue_atomics);
+		*local_queue_atomics = atomic_fetch_and_add_uint32((ccl_global uint*)&Queue_index[queue_number],
+		                                                   *local_queue_atomics);
 	}
-	barrier(CLK_LOCAL_MEM_FENCE);
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
 
 	/* Get global queue index and enqueue ray. */
 	if(enqueue_flag) {
@@ -96,19 +102,19 @@ ccl_device void enqueue_ray_index_local(
 
 ccl_device unsigned int get_local_queue_index(
         int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
-        ccl_local unsigned int *local_queue_atomics)
+        ccl_local_param unsigned int *local_queue_atomics)
 {
-	int my_lqidx = atomic_inc(&local_queue_atomics[queue_number]);
+	int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]);
 	return my_lqidx;
 }
 
 ccl_device unsigned int get_global_per_queue_offset(
         int queue_number,
-        ccl_local unsigned int *local_queue_atomics,
+        ccl_local_param unsigned int *local_queue_atomics,
         ccl_global int* global_queue_atomics)
 {
-	unsigned int queue_offset = atomic_add(&global_queue_atomics[queue_number],
-	                                       local_queue_atomics[queue_number]);
+	unsigned int queue_offset = atomic_fetch_and_add_uint32((ccl_global uint*)&global_queue_atomics[queue_number],
+	                                                        local_queue_atomics[queue_number]);
 	return queue_offset;
 }
 
@@ -116,10 +122,12 @@ ccl_device unsigned int get_global_queue_index(
     int queue_number,
     int queuesize,
     unsigned int lqidx,
-    ccl_local unsigned int * global_per_queue_offset)
+    ccl_local_param unsigned int * global_per_queue_offset)
 {
 	int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
 	return my_gqidx;
 }
 
+CCL_NAMESPACE_END
+
 #endif // __KERNEL_QUEUE_H__
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index e773753396f..d4f0caff5de 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "kernel_jitter.h"
+#include "kernel/kernel_jitter.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -98,7 +98,7 @@ ccl_device uint sobol_lookup(const uint m, const uint frame, const uint ex, cons
 	return index;
 }
 
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -130,7 +130,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *
 #endif
 }
 
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 #ifdef __CMJ__
 	if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) {
@@ -147,7 +147,7 @@ ccl_device_forceinline void path_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *r
 	}
 }
 
-ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, ccl_addr_space RNG *rng, int x, int y, float *fx, float *fy)
+ccl_device_inline void path_rng_init(KernelGlobals *kg, ccl_global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy)
 {
 #ifdef __SOBOL_FULL_SCREEN__
 	uint px, py;
@@ -191,14 +191,14 @@ ccl_device void path_rng_end(KernelGlobals *kg, ccl_global uint *rng_state, RNG
 
 /* Linear Congruential Generator */
 
-ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension)
 {
 	/* implicit mod 2^32 */
-	rng = (1103515245*(rng) + 12345);
-	return (float)rng * (1.0f/(float)0xFFFFFFFF);
+	*rng = (1103515245*(*rng) + 12345);
+	return (float)*rng * (1.0f/(float)0xFFFFFFFF);
 }
 
-ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy)
+ccl_device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy)
 {
 	*fx = path_rng_1D(kg, rng, sample, num_samples, dimension);
 	*fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1);
@@ -259,12 +259,12 @@ ccl_device uint lcg_init(uint seed)
  * For branches in the path we must be careful not to reuse the same number
  * in a sequence and offset accordingly. */
 
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension)
+ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension)
 {
 	/* the rng_offset is not increased for transparent bounces. if we do then
 	 * fully transparent objects can become subtly visible by the different
@@ -277,29 +277,29 @@ ccl_device_inline float path_state_rng_1D_for_decision(KernelGlobals *kg, ccl_ad
 	return path_rng_1D(kg, rng, state->sample, state->num_samples, rng_offset + dimension);
 }
 
-ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
+ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample, state->num_samples, state->rng_offset + dimension, fx, fy);
 }
 
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension)
 {
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension);
 }
 
-ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension)
+ccl_device_inline float path_branched_rng_1D_for_decision(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension)
 {
 	int rng_offset = state->rng_offset + state->transparent_bounce*PRNG_BOUNCE_NUM;
 	return path_rng_1D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, rng_offset + dimension);
 }
 
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
+ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches, int dimension, float *fx, float *fy)
 {
 	path_rng_2D(kg, rng, state->sample*num_branches + branch, state->num_samples*num_branches, state->rng_offset + dimension, fx, fy);
 }
 
 /* Utitility functions to get light termination value, since it might not be needed in many cases. */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const ccl_addr_space PathState *state)
+ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
 		return path_state_rng_1D_for_decision(kg, rng, state, PRNG_LIGHT_TERMINATE);
@@ -307,7 +307,7 @@ ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg, ccl_
 	return 0.0f;
 }
 
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, ccl_addr_space RNG *rng, const PathState *state, int branch, int num_branches)
+ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, RNG *rng, const ccl_addr_space PathState *state, int branch, int num_branches)
 {
 	if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
 		return path_branched_rng_1D_for_decision(kg, rng, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
@@ -315,7 +315,7 @@ ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg, c
 	return 0.0f;
 }
 
-ccl_device_inline void path_state_branch(PathState *state, int branch, int num_branches)
+ccl_device_inline void path_state_branch(ccl_addr_space PathState *state, int branch, int num_branches)
 {
 	/* path is splitting into a branch, adjust so that each branch
 	 * still gets a unique sample from the same sequence */
@@ -324,18 +324,9 @@ ccl_device_inline void path_state_branch(PathState *state, int branch, int num_b
 	state->num_samples = state->num_samples*num_branches;
 }
 
-ccl_device_inline uint lcg_state_init(RNG *rng, const PathState *state, uint scramble)
-{
-	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
-}
-
-/* TODO(sergey): For until we can use generic address space from OpenCL 2.0. */
-
-ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space RNG *rng,
-                                                const ccl_addr_space PathState *state,
-                                                uint scramble)
+ccl_device_inline uint lcg_state_init(RNG *rng, int rng_offset, int sample, uint scramble)
 {
-	return lcg_init(*rng + state->rng_offset + state->sample*scramble);
+	return lcg_init(*rng + rng_offset + sample*scramble);
 }
 
 ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index d0826e5e879..8c0c5e90a3e 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -24,12 +24,12 @@
  *
  */
 
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf.h"
-#include "closure/emissive.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf.h"
+#include "kernel/closure/emissive.h"
 
-#include "svm/svm.h"
+#include "kernel/svm/svm.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -38,13 +38,13 @@ CCL_NAMESPACE_BEGIN
 #ifdef __OBJECT_MOTION__
 ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
 {
-	if(ccl_fetch(sd, object_flag) & SD_OBJECT_MOTION) {
-		ccl_fetch(sd, ob_tfm) = object_fetch_transform_motion(kg, ccl_fetch(sd, object), time);
-		ccl_fetch(sd, ob_itfm) = transform_quick_inverse(ccl_fetch(sd, ob_tfm));
+	if(sd->object_flag & SD_OBJECT_MOTION) {
+		sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
+		sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
 	}
 	else {
-		ccl_fetch(sd, ob_tfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
-		ccl_fetch(sd, ob_itfm) = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
+		sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+		sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
 	}
 }
 #endif
@@ -55,55 +55,55 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
                                                const Ray *ray)
 {
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
+	sd->object = (isect->object == PRIM_NONE)? kernel_tex_fetch(__prim_object, isect->prim): isect->object;
 #endif
 
-	ccl_fetch(sd, type) = isect->type;
-	ccl_fetch(sd, flag) = 0;
-	ccl_fetch(sd, object_flag) = kernel_tex_fetch(__object_flag,
-	                                              ccl_fetch(sd, object));
+	sd->type = isect->type;
+	sd->flag = 0;
+	sd->object_flag = kernel_tex_fetch(__object_flag,
+	                                              sd->object);
 
 	/* matrices and time */
 #ifdef __OBJECT_MOTION__
 	shader_setup_object_transforms(kg, sd, ray->time);
-	ccl_fetch(sd, time) = ray->time;
+	sd->time = ray->time;
 #endif
 
-	ccl_fetch(sd, prim) = kernel_tex_fetch(__prim_index, isect->prim);
-	ccl_fetch(sd, ray_length) = isect->t;
+	sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
+	sd->ray_length = isect->t;
 
 #ifdef __UV__
-	ccl_fetch(sd, u) = isect->u;
-	ccl_fetch(sd, v) = isect->v;
+	sd->u = isect->u;
+	sd->v = isect->v;
 #endif
 
 #ifdef __HAIR__
-	if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+	if(sd->type & PRIMITIVE_ALL_CURVE) {
 		/* curve */
-		float4 curvedata = kernel_tex_fetch(__curves, ccl_fetch(sd, prim));
+		float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
 
-		ccl_fetch(sd, shader) = __float_as_int(curvedata.z);
-		ccl_fetch(sd, P) = bvh_curve_refine(kg, sd, isect, ray);
+		sd->shader = __float_as_int(curvedata.z);
+		sd->P = bvh_curve_refine(kg, sd, isect, ray);
 	}
 	else
 #endif
-	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* static triangle */
 		float3 Ng = triangle_normal(kg, sd);
-		ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
+		sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
 
 		/* vectors */
-		ccl_fetch(sd, P) = triangle_refine(kg, sd, isect, ray);
-		ccl_fetch(sd, Ng) = Ng;
-		ccl_fetch(sd, N) = Ng;
+		sd->P = triangle_refine(kg, sd, isect, ray);
+		sd->Ng = Ng;
+		sd->N = Ng;
 		
 		/* smooth normal */
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL)
-			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+		if(sd->shader & SHADER_SMOOTH_NORMAL)
+			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
 
 #ifdef __DPDU__
 		/* dPdu/dPdv */
-		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 #endif
 	}
 	else {
@@ -111,40 +111,40 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
 		motion_triangle_shader_setup(kg, sd, isect, ray, false);
 	}
 
-	ccl_fetch(sd, I) = -ray->D;
+	sd->I = -ray->D;
 
-	ccl_fetch(sd, flag) |= kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
+	sd->flag |= kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
 
 #ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
+		object_normal_transform_auto(kg, sd, &sd->N);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
 #  ifdef __DPDU__
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+		object_dir_transform_auto(kg, sd, &sd->dPdu);
+		object_dir_transform_auto(kg, sd, &sd->dPdv);
 #  endif
 	}
 #endif
 
 	/* backfacing test */
-	bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+	bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
 
 	if(backfacing) {
-		ccl_fetch(sd, flag) |= SD_BACKFACING;
-		ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
-		ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+		sd->flag |= SD_BACKFACING;
+		sd->Ng = -sd->Ng;
+		sd->N = -sd->N;
 #ifdef __DPDU__
-		ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
-		ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+		sd->dPdu = -sd->dPdu;
+		sd->dPdv = -sd->dPdv;
 #endif
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	differential_transfer(&ccl_fetch(sd, dP), ray->dP, ray->D, ray->dD, ccl_fetch(sd, Ng), isect->t);
-	differential_incoming(&ccl_fetch(sd, dI), ray->dD);
-	differential_dudv(&ccl_fetch(sd, du), &ccl_fetch(sd, dv), ccl_fetch(sd, dPdu), ccl_fetch(sd, dPdv), ccl_fetch(sd, dP), ccl_fetch(sd, Ng));
+	differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
+	differential_incoming(&sd->dI, ray->dD);
+	differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
 #endif
 }
 
@@ -203,11 +203,11 @@ void shader_setup_from_subsurface(
 #  ifdef __INSTANCING__
 	if(isect->object != OBJECT_NONE) {
 		/* instance transform */
-		object_normal_transform(kg, sd, &sd->N);
-		object_normal_transform(kg, sd, &sd->Ng);
+		object_normal_transform_auto(kg, sd, &sd->N);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
 #    ifdef __DPDU__
-		object_dir_transform(kg, sd, &sd->dPdu);
-		object_dir_transform(kg, sd, &sd->dPdv);
+		object_dir_transform_auto(kg, sd, &sd->dPdu);
+		object_dir_transform_auto(kg, sd, &sd->dPdv);
 #    endif
 	}
 #  endif
@@ -249,106 +249,106 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
                                                 int lamp)
 {
 	/* vectors */
-	ccl_fetch(sd, P) = P;
-	ccl_fetch(sd, N) = Ng;
-	ccl_fetch(sd, Ng) = Ng;
-	ccl_fetch(sd, I) = I;
-	ccl_fetch(sd, shader) = shader;
+	sd->P = P;
+	sd->N = Ng;
+	sd->Ng = Ng;
+	sd->I = I;
+	sd->shader = shader;
 	if(prim != PRIM_NONE)
-		ccl_fetch(sd, type) = PRIMITIVE_TRIANGLE;
+		sd->type = PRIMITIVE_TRIANGLE;
 	else if(lamp != LAMP_NONE)
-		ccl_fetch(sd, type) = PRIMITIVE_LAMP;
+		sd->type = PRIMITIVE_LAMP;
 	else
-		ccl_fetch(sd, type) = PRIMITIVE_NONE;
+		sd->type = PRIMITIVE_NONE;
 
 	/* primitive */
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = object;
+	sd->object = object;
 #endif
 	/* currently no access to bvh prim index for strand sd->prim*/
-	ccl_fetch(sd, prim) = prim;
+	sd->prim = prim;
 #ifdef __UV__
-	ccl_fetch(sd, u) = u;
-	ccl_fetch(sd, v) = v;
+	sd->u = u;
+	sd->v = v;
 #endif
-	ccl_fetch(sd, ray_length) = t;
+	sd->ray_length = t;
 
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
-	ccl_fetch(sd, object_flag) = 0;
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
-		ccl_fetch(sd, object_flag) |= kernel_tex_fetch(__object_flag,
-		                                               ccl_fetch(sd, object));
+	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+	sd->object_flag = 0;
+	if(sd->object != OBJECT_NONE) {
+		sd->object_flag |= kernel_tex_fetch(__object_flag,
+		                                               sd->object);
 
 #ifdef __OBJECT_MOTION__
 		shader_setup_object_transforms(kg, sd, time);
-		ccl_fetch(sd, time) = time;
+		sd->time = time;
 	}
 	else if(lamp != LAMP_NONE) {
-		ccl_fetch(sd, ob_tfm)  = lamp_fetch_transform(kg, lamp, false);
-		ccl_fetch(sd, ob_itfm) = lamp_fetch_transform(kg, lamp, true);
+		sd->ob_tfm  = lamp_fetch_transform(kg, lamp, false);
+		sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
 #endif
 	}
 
 	/* transform into world space */
 	if(object_space) {
-		object_position_transform_auto(kg, sd, &ccl_fetch(sd, P));
-		object_normal_transform_auto(kg, sd, &ccl_fetch(sd, Ng));
-		ccl_fetch(sd, N) = ccl_fetch(sd, Ng);
-		object_dir_transform_auto(kg, sd, &ccl_fetch(sd, I));
+		object_position_transform_auto(kg, sd, &sd->P);
+		object_normal_transform_auto(kg, sd, &sd->Ng);
+		sd->N = sd->Ng;
+		object_dir_transform_auto(kg, sd, &sd->I);
 	}
 
-	if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+	if(sd->type & PRIMITIVE_TRIANGLE) {
 		/* smooth normal */
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
-			ccl_fetch(sd, N) = triangle_smooth_normal(kg, ccl_fetch(sd, prim), ccl_fetch(sd, u), ccl_fetch(sd, v));
+		if(sd->shader & SHADER_SMOOTH_NORMAL) {
+			sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
 
 #ifdef __INSTANCING__
-			if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
-				object_normal_transform_auto(kg, sd, &ccl_fetch(sd, N));
+			if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+				object_normal_transform_auto(kg, sd, &sd->N);
 			}
 #endif
 		}
 
 		/* dPdu/dPdv */
 #ifdef __DPDU__
-		triangle_dPdudv(kg, ccl_fetch(sd, prim), &ccl_fetch(sd, dPdu), &ccl_fetch(sd, dPdv));
+		triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
 
 #  ifdef __INSTANCING__
-		if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
-			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdu));
-			object_dir_transform_auto(kg, sd, &ccl_fetch(sd, dPdv));
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+			object_dir_transform_auto(kg, sd, &sd->dPdu);
+			object_dir_transform_auto(kg, sd, &sd->dPdv);
 		}
 #  endif
 #endif
 	}
 	else {
 #ifdef __DPDU__
-		ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
-		ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+		sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+		sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 	}
 
 	/* backfacing test */
-	if(ccl_fetch(sd, prim) != PRIM_NONE) {
-		bool backfacing = (dot(ccl_fetch(sd, Ng), ccl_fetch(sd, I)) < 0.0f);
+	if(sd->prim != PRIM_NONE) {
+		bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
 
 		if(backfacing) {
-			ccl_fetch(sd, flag) |= SD_BACKFACING;
-			ccl_fetch(sd, Ng) = -ccl_fetch(sd, Ng);
-			ccl_fetch(sd, N) = -ccl_fetch(sd, N);
+			sd->flag |= SD_BACKFACING;
+			sd->Ng = -sd->Ng;
+			sd->N = -sd->N;
 #ifdef __DPDU__
-			ccl_fetch(sd, dPdu) = -ccl_fetch(sd, dPdu);
-			ccl_fetch(sd, dPdv) = -ccl_fetch(sd, dPdv);
+			sd->dPdu = -sd->dPdu;
+			sd->dPdv = -sd->dPdv;
 #endif
 		}
 	}
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* no ray differentials here yet */
-	ccl_fetch(sd, dP) = differential3_zero();
-	ccl_fetch(sd, dI) = differential3_zero();
-	ccl_fetch(sd, du) = differential_zero();
-	ccl_fetch(sd, dv) = differential_zero();
+	sd->dP = differential3_zero();
+	sd->dI = differential3_zero();
+	sd->du = differential_zero();
+	sd->dv = differential_zero();
 #endif
 }
 
@@ -378,39 +378,39 @@ ccl_device void shader_setup_from_displace(KernelGlobals *kg, ShaderData *sd,
 ccl_device_inline void shader_setup_from_background(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
 {
 	/* vectors */
-	ccl_fetch(sd, P) = ray->D;
-	ccl_fetch(sd, N) = -ray->D;
-	ccl_fetch(sd, Ng) = -ray->D;
-	ccl_fetch(sd, I) = -ray->D;
-	ccl_fetch(sd, shader) = kernel_data.background.surface_shader;
-	ccl_fetch(sd, flag) = kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*SHADER_SIZE);
-	ccl_fetch(sd, object_flag) = 0;
+	sd->P = ray->D;
+	sd->N = -ray->D;
+	sd->Ng = -ray->D;
+	sd->I = -ray->D;
+	sd->shader = kernel_data.background.surface_shader;
+	sd->flag = kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*SHADER_SIZE);
+	sd->object_flag = 0;
 #ifdef __OBJECT_MOTION__
-	ccl_fetch(sd, time) = ray->time;
+	sd->time = ray->time;
 #endif
-	ccl_fetch(sd, ray_length) = 0.0f;
+	sd->ray_length = 0.0f;
 
 #ifdef __INSTANCING__
-	ccl_fetch(sd, object) = PRIM_NONE;
+	sd->object = PRIM_NONE;
 #endif
-	ccl_fetch(sd, prim) = PRIM_NONE;
+	sd->prim = PRIM_NONE;
 #ifdef __UV__
-	ccl_fetch(sd, u) = 0.0f;
-	ccl_fetch(sd, v) = 0.0f;
+	sd->u = 0.0f;
+	sd->v = 0.0f;
 #endif
 
 #ifdef __DPDU__
 	/* dPdu/dPdv */
-	ccl_fetch(sd, dPdu) = make_float3(0.0f, 0.0f, 0.0f);
-	ccl_fetch(sd, dPdv) = make_float3(0.0f, 0.0f, 0.0f);
+	sd->dPdu = make_float3(0.0f, 0.0f, 0.0f);
+	sd->dPdv = make_float3(0.0f, 0.0f, 0.0f);
 #endif
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differentials */
-	ccl_fetch(sd, dP) = ray->dD;
-	differential_incoming(&ccl_fetch(sd, dI), ccl_fetch(sd, dP));
-	ccl_fetch(sd, du) = differential_zero();
-	ccl_fetch(sd, dv) = differential_zero();
+	sd->dP = ray->dD;
+	differential_incoming(&sd->dI, sd->dP);
+	sd->du = differential_zero();
+	sd->dv = differential_zero();
 #endif
 }
 
@@ -505,18 +505,18 @@ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd
 {
 	/* this is the veach one-sample model with balance heuristic, some pdf
 	 * factors drop out when using balance heuristic weighting */
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
+	for(int i = 0; i < sd->num_closure; i++) {
 		if(i == skip_bsdf)
 			continue;
 
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+		const ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
 			float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
 
 			if(bsdf_pdf != 0.0f) {
-				bsdf_eval_accum(result_eval, sc->type, eval*sc->weight);
+				bsdf_eval_accum(result_eval, sc->type, eval*sc->weight, 1.0f);
 				sum_pdf += bsdf_pdf*sc->sample_weight;
 			}
 
@@ -535,8 +535,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
                                                         float light_pdf,
                                                         bool use_mis)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 		if(CLOSURE_IS_BSDF(sc->type)) {
 			float bsdf_pdf = 0.0f;
 			float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
@@ -544,7 +544,8 @@ ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
 				float mis_weight = use_mis? power_heuristic(light_pdf, bsdf_pdf): 1.0f;
 				bsdf_eval_accum(result_eval,
 				                sc->type,
-				                eval * sc->weight * mis_weight);
+				                eval * sc->weight,
+				                mis_weight);
 			}
 		}
 	}
@@ -576,7 +577,7 @@ void shader_bsdf_eval(KernelGlobals *kg,
 		_shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f);
 		if(use_mis) {
 			float weight = power_heuristic(light_pdf, pdf);
-			bsdf_eval_mul(eval, weight);
+			bsdf_eval_mis(eval, weight);
 		}
 	}
 }
@@ -591,22 +592,22 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
 {
 	int sampled = 0;
 
-	if(ccl_fetch(sd, num_closure) > 1) {
+	if(sd->num_closure > 1) {
 		/* pick a BSDF closure based on sample weights */
 		float sum = 0.0f;
 
-		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
-			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+		for(sampled = 0; sampled < sd->num_closure; sampled++) {
+			const ShaderClosure *sc = &sd->closure[sampled];
 			
 			if(CLOSURE_IS_BSDF(sc->type))
 				sum += sc->sample_weight;
 		}
 
-		float r = ccl_fetch(sd, randb_closure)*sum;
+		float r = sd->randb_closure*sum;
 		sum = 0.0f;
 
-		for(sampled = 0; sampled < ccl_fetch(sd, num_closure); sampled++) {
-			const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+		for(sampled = 0; sampled < sd->num_closure; sampled++) {
+			const ShaderClosure *sc = &sd->closure[sampled];
 			
 			if(CLOSURE_IS_BSDF(sc->type)) {
 				sum += sc->sample_weight;
@@ -616,13 +617,13 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
 			}
 		}
 
-		if(sampled == ccl_fetch(sd, num_closure)) {
+		if(sampled == sd->num_closure) {
 			*pdf = 0.0f;
 			return LABEL_NONE;
 		}
 	}
 
-	const ShaderClosure *sc = ccl_fetch_array(sd, closure, sampled);
+	const ShaderClosure *sc = &sd->closure[sampled];
 
 	int label;
 	float3 eval;
@@ -633,7 +634,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
 	if(*pdf != 0.0f) {
 		bsdf_eval_init(bsdf_eval, sc->type, eval*sc->weight, kernel_data.film.use_light_pass);
 
-		if(ccl_fetch(sd, num_closure) > 1) {
+		if(sd->num_closure > 1) {
 			float sweight = sc->sample_weight;
 			_shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
 		}
@@ -660,8 +661,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg, ShaderData *sd,
 
 ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
 {
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF(sc->type))
 			bsdf_blur(kg, sc, roughness);
@@ -670,13 +671,13 @@ ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughn
 
 ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 {
-	if(ccl_fetch(sd, flag) & SD_HAS_ONLY_VOLUME)
+	if(sd->flag & SD_HAS_ONLY_VOLUME)
 		return make_float3(1.0f, 1.0f, 1.0f);
 
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) // todo: make this work for osl
 			eval += sc->weight;
@@ -685,6 +686,18 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, ShaderData *sd)
 	return eval;
 }
 
+ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd)
+{
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
+
+		if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID) {
+			sc->sample_weight = 0.0f;
+			sc->weight = make_float3(0.0f, 0.0f, 0.0f);
+		}
+	}
+}
+
 ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 alpha = make_float3(1.0f, 1.0f, 1.0f) - shader_bsdf_transparency(kg, sd);
@@ -699,8 +712,8 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type))
 			eval += sc->weight;
@@ -713,8 +726,8 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_GLOSSY(sc->type))
 			eval += sc->weight;
@@ -727,8 +740,8 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
 			eval += sc->weight;
@@ -741,8 +754,8 @@ ccl_device float3 shader_bsdf_subsurface(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSSRDF(sc->type) || CLOSURE_IS_BSDF_BSSRDF(sc->type))
 			eval += sc->weight;
@@ -756,8 +769,8 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
 			const DiffuseBsdf *bsdf = (const DiffuseBsdf*)sc;
@@ -766,12 +779,12 @@ ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_fac
 		}
 		else if(CLOSURE_IS_AMBIENT_OCCLUSION(sc->type)) {
 			eval += sc->weight;
-			N += ccl_fetch(sd, N)*average(sc->weight);
+			N += sd->N*average(sc->weight);
 		}
 	}
 
 	if(is_zero(N))
-		N = ccl_fetch(sd, N);
+		N = sd->N;
 	else
 		N = normalize(N);
 
@@ -786,8 +799,8 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	float3 N = make_float3(0.0f, 0.0f, 0.0f);
 	float texture_blur = 0.0f, weight_sum = 0.0f;
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BSSRDF(sc->type)) {
 			const Bssrdf *bssrdf = (const Bssrdf*)sc;
@@ -801,10 +814,10 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 	}
 
 	if(N_)
-		*N_ = (is_zero(N))? ccl_fetch(sd, N): normalize(N);
+		*N_ = (is_zero(N))? sd->N: normalize(N);
 
 	if(texture_blur_)
-		*texture_blur_ = texture_blur/weight_sum;
+		*texture_blur_ = safe_divide(texture_blur, weight_sum);
 	
 	return eval;
 }
@@ -814,7 +827,7 @@ ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_b
 
 ccl_device float3 emissive_eval(KernelGlobals *kg, ShaderData *sd, ShaderClosure *sc)
 {
-	return emissive_simple_eval(ccl_fetch(sd, Ng), ccl_fetch(sd, I));
+	return emissive_simple_eval(sd->Ng, sd->I);
 }
 
 ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
@@ -822,8 +835,8 @@ ccl_device float3 shader_emissive_eval(KernelGlobals *kg, ShaderData *sd)
 	float3 eval;
 	eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_EMISSION(sc->type))
 			eval += emissive_eval(kg, sd, sc)*sc->weight;
@@ -838,8 +851,8 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 {
 	float3 weight = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_HOLDOUT(sc->type))
 			weight += sc->weight;
@@ -850,12 +863,12 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
 
 /* Surface Evaluation */
 
-ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_space RNG *rng,
+ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, RNG *rng,
 	ccl_addr_space PathState *state, float randb, int path_flag, ShaderContext ctx)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = randb;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
+	sd->randb_closure = randb;
 
 #ifdef __OSL__
 	if(kg->osl)
@@ -869,13 +882,13 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
 		DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd,
 		                                             sizeof(DiffuseBsdf),
 		                                             make_float3(0.8f, 0.8f, 0.8f));
-		bsdf->N = ccl_fetch(sd, N);
-		ccl_fetch(sd, flag) |= bsdf_diffuse_setup(bsdf);
+		bsdf->N = sd->N;
+		sd->flag |= bsdf_diffuse_setup(bsdf);
 #endif
 	}
 
-	if(rng && (ccl_fetch(sd, flag) & SD_BSDF_NEEDS_LCG)) {
-		ccl_fetch(sd, lcg_state) = lcg_state_init_addrspace(rng, state, 0xb4bc3953);
+	if(rng && (sd->flag & SD_BSDF_NEEDS_LCG)) {
+		sd->lcg_state = lcg_state_init(rng, state->rng_offset, state->sample, 0xb4bc3953);
 	}
 }
 
@@ -884,9 +897,9 @@ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, ccl_addr_
 ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
 	ccl_addr_space PathState *state, int path_flag, ShaderContext ctx)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = 0.0f;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
+	sd->randb_closure = 0.0f;
 
 #ifdef __SVM__
 #ifdef __OSL__
@@ -901,8 +914,8 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
 
 	float3 eval = make_float3(0.0f, 0.0f, 0.0f);
 
-	for(int i = 0; i < ccl_fetch(sd, num_closure); i++) {
-		const ShaderClosure *sc = ccl_fetch_array(sd, closure, i);
+	for(int i = 0; i < sd->num_closure; i++) {
+		const ShaderClosure *sc = &sd->closure[i];
 
 		if(CLOSURE_IS_BACKGROUND(sc->type))
 			eval += sc->weight;
@@ -932,7 +945,7 @@ ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd, con
 			float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf);
 
 			if(phase_pdf != 0.0f) {
-				bsdf_eval_accum(result_eval, sc->type, eval);
+				bsdf_eval_accum(result_eval, sc->type, eval, 1.0f);
 				sum_pdf += phase_pdf*sc->sample_weight;
 			}
 
@@ -1024,8 +1037,8 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg, const ShaderData *
 
 ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
                                           ShaderData *sd,
-                                          PathState *state,
-                                          VolumeStack *stack,
+                                          ccl_addr_space PathState *state,
+                                          ccl_addr_space VolumeStack *stack,
                                           int path_flag,
                                           ShaderContext ctx)
 {
@@ -1081,9 +1094,9 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
 
 ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderContext ctx)
 {
-	ccl_fetch(sd, num_closure) = 0;
-	ccl_fetch(sd, num_closure_extra) = 0;
-	ccl_fetch(sd, randb_closure) = 0.0f;
+	sd->num_closure = 0;
+	sd->num_closure_extra = 0;
+	sd->randb_closure = 0.0f;
 
 	/* this will modify sd->P */
 #ifdef __SVM__
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index 2981f6ac566..0426e0a62c9 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -16,9 +16,84 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __SHADOW_RECORD_ALL__
+/* Attenuate throughput according to the given intersection event.
+ * Returns true if the throughput is zero and traversal can be aborted.
+ */
+ccl_device_forceinline bool shadow_handle_transparent_isect(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+#    ifdef __VOLUME__
+        ccl_addr_space struct PathState *volume_state,
+#    endif
+        Intersection *isect,
+        Ray *ray,
+        float3 *throughput)
+{
+#ifdef __VOLUME__
+	/* Attenuation between last surface and next surface. */
+	if(volume_state->volume_stack[0].shader != SHADER_NONE) {
+		Ray segment_ray = *ray;
+		segment_ray.t = isect->t;
+		kernel_volume_shadow(kg,
+		                     shadow_sd,
+		                     volume_state,
+		                     &segment_ray,
+		                     throughput);
+	}
+#endif
+	/* Setup shader data at surface. */
+	shader_setup_from_ray(kg, shadow_sd, isect, ray);
+	/* Attenuation from transparent surface. */
+	if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
+		path_state_modify_bounce(state, true);
+		shader_eval_surface(kg,
+		                    shadow_sd,
+		                    NULL,
+		                    state,
+		                    0.0f,
+		                    PATH_RAY_SHADOW,
+		                    SHADER_CONTEXT_SHADOW);
+		path_state_modify_bounce(state, false);
+		*throughput *= shader_bsdf_transparency(kg, shadow_sd);
+	}
+	/* Stop if all light is blocked. */
+	if(is_zero(*throughput)) {
+		return true;
+	}
+#ifdef __VOLUME__
+	/* Exit/enter volume. */
+	kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack);
+#endif
+	return false;
+}
+
+/* Special version which only handles opaque shadows. */
+ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
+                                      ShaderData *shadow_sd,
+                                      ccl_addr_space PathState *state,
+                                      Ray *ray,
+                                      Intersection *isect,
+                                      float3 *shadow)
+{
+	const bool blocked = scene_intersect(kg,
+	                                     *ray,
+	                                     PATH_RAY_SHADOW_OPAQUE,
+	                                     isect,
+	                                     NULL,
+	                                     0.0f, 0.0f);
+#ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* Apply attenuation from current volume shader. */
+		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
+	}
+#endif
+	return blocked;
+}
 
-/* Shadow function to compute how much light is blocked, CPU variation.
+#ifdef __TRANSPARENT_SHADOWS__
+#  ifdef __SHADOW_RECORD_ALL__
+/* Shadow function to compute how much light is blocked,
  *
  * We trace a single ray. If it hits any opaque surface, or more than a given
  * number of transparent surfaces is hit, then we consider the geometry to be
@@ -36,261 +111,403 @@ CCL_NAMESPACE_BEGIN
  * or there is a performance increase anyway due to avoiding the need to send
  * two rays with transparent shadows.
  *
- * This is CPU only because of qsort, and malloc or high stack space usage to
- * record all these intersections. */
+ * On CPU it'll handle all transparent bounces (by allocating storage for
+ * intersections when they don't fit into the stack storage).
+ *
+ * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
+ * is something to be kept an eye on.
+ */
 
-#define STACK_MAX_HITS 64
+#    define SHADOW_STACK_MAX_HITS 64
 
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *shadow)
+/* Actual logic with traversal loop implementation which is free from device
+ * specific tweaks.
+ *
+ * Note that hits array should be as big as max_hits+1.
+ */
+ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
+                                                    ShaderData *shadow_sd,
+                                                    ccl_addr_space PathState *state,
+                                                    const int skip_object,
+                                                    Ray *ray,
+                                                    Intersection *hits,
+                                                    uint max_hits,
+                                                    float3 *shadow)
 {
-	*shadow = make_float3(1.0f, 1.0f, 1.0f);
-
-	if(ray->t == 0.0f)
-		return false;
-	
-	bool blocked;
-
-	if(kernel_data.integrator.transparent_shadows) {
-		/* check transparent bounces here, for volume scatter which can do
-		 * lighting before surface path termination is checked */
-		if(state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce)
-			return true;
-
-		/* intersect to find an opaque surface, or record all transparent surface hits */
-		Intersection hits_stack[STACK_MAX_HITS];
-		Intersection *hits = hits_stack;
-		const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
-		uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
-
-		/* prefer to use stack but use dynamic allocation if too deep max hits
-		 * we need max_hits + 1 storage space due to the logic in
-		 * scene_intersect_shadow_all which will first store and then check if
-		 * the limit is exceeded */
-		if(max_hits + 1 > STACK_MAX_HITS) {
-			if(kg->transparent_shadow_intersections == NULL) {
-				kg->transparent_shadow_intersections =
-				    (Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+	/* Intersect to find an opaque surface, or record all transparent
+	 * surface hits.
+	 */
+	uint num_hits;
+	const bool blocked = scene_intersect_shadow_all(kg,
+	                                                ray,
+	                                                hits,
+	                                                skip_object,
+	                                                max_hits,
+	                                                &num_hits);
+	/* If no opaque surface found but we did find transparent hits,
+	 * shade them.
+	 */
+	if(!blocked && num_hits > 0) {
+		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+		float3 Pend = ray->P + ray->D*ray->t;
+		float last_t = 0.0f;
+		int bounce = state->transparent_bounce;
+		Intersection *isect = hits;
+#    ifdef __VOLUME__
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#      else
+		PathState ps_object;
+		PathState *ps = &ps_object;
+#      endif
+		*ps = *state;
+#    endif
+		sort_intersections(hits, num_hits);
+		for(int hit = 0; hit < num_hits; hit++, isect++) {
+			/* Adjust intersection distance for moving ray forward. */
+			float new_t = isect->t;
+			isect->t -= last_t;
+			/* Skip hit if we did not move forward, step by step raytracing
+			 * would have skipped it as well then.
+			 */
+			if(last_t == new_t) {
+				continue;
 			}
-			hits = kg->transparent_shadow_intersections;
-		}
-
-		uint num_hits;
-		blocked = scene_intersect_shadow_all(kg, ray, hits, max_hits, &num_hits);
-
-		/* if no opaque surface found but we did find transparent hits, shade them */
-		if(!blocked && num_hits > 0) {
-			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-			float3 Pend = ray->P + ray->D*ray->t;
-			float last_t = 0.0f;
-			int bounce = state->transparent_bounce;
-			Intersection *isect = hits;
+			last_t = new_t;
+			/* Attenuate the throughput. */
+			if(shadow_handle_transparent_isect(kg,
+			                                   shadow_sd,
+			                                   state,
 #ifdef __VOLUME__
-			PathState ps = *state;
+			                                   ps,
 #endif
-
-			qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
-			for(int hit = 0; hit < num_hits; hit++, isect++) {
-				/* adjust intersection distance for moving ray forward */
-				float new_t = isect->t;
-				isect->t -= last_t;
-
-				/* skip hit if we did not move forward, step by step raytracing
-				 * would have skipped it as well then */
-				if(last_t == new_t)
-					continue;
-
-				last_t = new_t;
-
-#ifdef __VOLUME__
-				/* attenuation between last surface and next surface */
-				if(ps.volume_stack[0].shader != SHADER_NONE) {
-					Ray segment_ray = *ray;
-					segment_ray.t = isect->t;
-					kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
-				}
-#endif
-
-				/* setup shader data at surface */
-				shader_setup_from_ray(kg, shadow_sd, isect, ray);
-
-				/* attenuation from transparent surface */
-				if(!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
-					path_state_modify_bounce(state, true);
-					shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					path_state_modify_bounce(state, false);
-
-					throughput *= shader_bsdf_transparency(kg, shadow_sd);
-				}
-
-				/* stop if all light is blocked */
-				if(is_zero(throughput)) {
-					return true;
-				}
-
-				/* move ray forward */
-				ray->P = shadow_sd->P;
-				if(ray->t != FLT_MAX) {
-					ray->D = normalize_len(Pend - ray->P, &ray->t);
-				}
-
-#ifdef __VOLUME__
-				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
-#endif
-
-				bounce++;
+			                                   isect,
+			                                   ray,
+			                                   &throughput))
+			{
+				return true;
 			}
-
-#ifdef __VOLUME__
-			/* attenuation for last line segment towards light */
-			if(ps.volume_stack[0].shader != SHADER_NONE)
-				kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
-#endif
-
-			*shadow = throughput;
-
-			return is_zero(throughput);
+			/* Move ray forward. */
+			ray->P = shadow_sd->P;
+			if(ray->t != FLT_MAX) {
+				ray->D = normalize_len(Pend - ray->P, &ray->t);
+			}
+			bounce++;
 		}
+#    ifdef __VOLUME__
+		/* Attenuation for last line segment towards light. */
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
+		}
+#    endif
+		*shadow = throughput;
+		return is_zero(throughput);
 	}
-	else {
-		Intersection isect;
-		blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, &isect, NULL, 0.0f, 0.0f);
-	}
-
-#ifdef __VOLUME__
+#    ifdef __VOLUME__
 	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* apply attenuation from current volume shader */
+		/* Apply attenuation from current volume shader. */
 		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
 	}
-#endif
-
+#    endif
 	return blocked;
 }
 
-#undef STACK_MAX_HITS
-
-#else
+/* Here we do all device specific trickery before invoking actual traversal
+ * loop to help readability of the actual logic.
+ */
+ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
+                                               ShaderData *shadow_sd,
+                                               ccl_addr_space PathState *state,
+                                               const int skip_object,
+                                               Ray *ray,
+                                               uint max_hits,
+                                               float3 *shadow)
+{
+#    ifdef __SPLIT_KERNEL__
+	Intersection hits_[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = &hits_[0];
+#    elif defined(__KERNEL_CUDA__)
+	Intersection *hits = kg->hits_stack;
+#    else
+	Intersection hits_stack[SHADOW_STACK_MAX_HITS];
+	Intersection *hits = hits_stack;
+#    endif
+#    ifndef __KERNEL_GPU__
+	/* Prefer to use stack but use dynamic allocation if too deep max hits
+	 * we need max_hits + 1 storage space due to the logic in
+	 * scene_intersect_shadow_all which will first store and then check if
+	 * the limit is exceeded.
+	 *
+	 * Ignore this on GPU because of slow/unavailable malloc().
+	 */
+	if(max_hits + 1 > SHADOW_STACK_MAX_HITS) {
+		if(kg->transparent_shadow_intersections == NULL) {
+			const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+			kg->transparent_shadow_intersections =
+				(Intersection*)malloc(sizeof(Intersection)*(transparent_max_bounce + 1));
+		}
+		hits = kg->transparent_shadow_intersections;
+	}
+#    endif  /* __KERNEL_GPU__ */
+	/* Invoke actual traversal. */
+	return shadow_blocked_transparent_all_loop(kg,
+	                                           shadow_sd,
+	                                           state,
+	                                           skip_object,
+	                                           ray,
+	                                           hits,
+	                                           max_hits,
+	                                           shadow);
+}
+#  endif  /* __SHADOW_RECORD_ALL__ */
 
-/* Shadow function to compute how much light is blocked, GPU variation.
+#  if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
+/* Shadow function to compute how much light is blocked,
  *
  * Here we raytrace from one transparent surface to the next step by step.
  * To minimize overhead in cases where we don't need transparent shadows, we
  * first trace a regular shadow ray. We check if the hit primitive was
  * potentially transparent, and only in that case start marching. this gives
- * one extra ray cast for the cases were we do want transparency. */
+ * one extra ray cast for the cases where we do want transparency.
+ */
 
-ccl_device_noinline bool shadow_blocked(KernelGlobals *kg,
-                                        ShaderData *shadow_sd,
-                                        ccl_addr_space PathState *state,
-                                        ccl_addr_space Ray *ray_input,
-                                        float3 *shadow)
+/* This function is only implementing device-independent traversal logic
+ * which requires some precalculation done.
+ */
+ccl_device bool shadow_blocked_transparent_stepped_loop(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+        const int skip_object,
+        Ray *ray,
+        Intersection *isect,
+        const bool blocked,
+        const bool is_transparent_isect,
+        float3 *shadow)
 {
-	*shadow = make_float3(1.0f, 1.0f, 1.0f);
-
-	if(ray_input->t == 0.0f)
-		return false;
-
-#ifdef __SPLIT_KERNEL__
-	Ray private_ray = *ray_input;
-	Ray *ray = &private_ray;
-#else
-	Ray *ray = ray_input;
-#endif
-
-#ifdef __SPLIT_KERNEL__
-	Intersection *isect = &kg->isect_shadow[SD_THREAD];
-#else
-	Intersection isect_object;
-	Intersection *isect = &isect_object;
-#endif
-
-	bool blocked = scene_intersect(kg, *ray, PATH_RAY_SHADOW_OPAQUE, isect, NULL, 0.0f, 0.0f);
-
-#ifdef __TRANSPARENT_SHADOWS__
-	if(blocked && kernel_data.integrator.transparent_shadows) {
-		if(shader_transparent_shadow(kg, isect)) {
-			float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-			float3 Pend = ray->P + ray->D*ray->t;
-			int bounce = state->transparent_bounce;
-#ifdef __VOLUME__
-			PathState ps = *state;
-#endif
-
-			for(;;) {
-				if(bounce >= kernel_data.integrator.transparent_max_bounce)
-					return true;
-
-				if(!scene_intersect(kg, *ray, PATH_RAY_SHADOW_TRANSPARENT, isect, NULL, 0.0f, 0.0f))
-				{
-#ifdef __VOLUME__
-					/* attenuation for last line segment towards light */
-					if(ps.volume_stack[0].shader != SHADER_NONE)
-						kernel_volume_shadow(kg, shadow_sd, &ps, ray, &throughput);
-#endif
-
-					*shadow *= throughput;
-
-					return false;
-				}
-
-				if(!shader_transparent_shadow(kg, isect)) {
-					return true;
-				}
-
-#ifdef __VOLUME__
-				/* attenuation between last surface and next surface */
-				if(ps.volume_stack[0].shader != SHADER_NONE) {
-					Ray segment_ray = *ray;
-					segment_ray.t = isect->t;
-					kernel_volume_shadow(kg, shadow_sd, &ps, &segment_ray, &throughput);
+	if((blocked && is_transparent_isect) || skip_object != OBJECT_NONE) {
+		float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
+		float3 Pend = ray->P + ray->D*ray->t;
+		int bounce = state->transparent_bounce;
+#    ifdef __VOLUME__
+#      ifdef __SPLIT_KERNEL__
+		ccl_addr_space PathState *ps = &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
+#      else
+		PathState ps_object;
+		PathState *ps = &ps_object;
+#      endif
+		*ps = *state;
+#    endif
+		for(;;) {
+			if(bounce >= kernel_data.integrator.transparent_max_bounce) {
+				return true;
+			}
+			if(!scene_intersect(kg,
+			                    *ray,
+			                    PATH_RAY_SHADOW_TRANSPARENT,
+			                    isect,
+			                    NULL,
+			                    0.0f, 0.0f))
+			{
+				break;
+			}
+#ifdef __SHADOW_TRICKS__
+			if(skip_object != OBJECT_NONE) {
+				const int isect_object = (isect->object == PRIM_NONE)
+				        ? kernel_tex_fetch(__prim_object, isect->prim)
+				        : isect->object;
+				if(isect_object == skip_object) {
+					shader_setup_from_ray(kg, shadow_sd, isect, ray);
+					/* Move ray forward. */
+					ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
+					if(ray->t != FLT_MAX) {
+						ray->D = normalize_len(Pend - ray->P, &ray->t);
+					}
+					bounce++;
+					continue;
 				}
+			}
 #endif
-
-				/* setup shader data at surface */
-				shader_setup_from_ray(kg, shadow_sd, isect, ray);
-
-				/* attenuation from transparent surface */
-				if(!(ccl_fetch(shadow_sd, flag) & SD_HAS_ONLY_VOLUME)) {
-					path_state_modify_bounce(state, true);
-					shader_eval_surface(kg, shadow_sd, NULL, state, 0.0f, PATH_RAY_SHADOW, SHADER_CONTEXT_SHADOW);
-					path_state_modify_bounce(state, false);
-
-					throughput *= shader_bsdf_transparency(kg, shadow_sd);
-				}
-
-				/* stop if all light is blocked */
-				if(is_zero(throughput)) {
-					return true;
-				}
-
-				/* move ray forward */
-				ray->P = ray_offset(ccl_fetch(shadow_sd, P), -ccl_fetch(shadow_sd, Ng));
-				if(ray->t != FLT_MAX) {
-					ray->D = normalize_len(Pend - ray->P, &ray->t);
-				}
-
+			if(!shader_transparent_shadow(kg, isect)) {
+				return true;
+			}
+			/* Attenuate the throughput. */
+			if(shadow_handle_transparent_isect(kg,
+			                                   shadow_sd,
+			                                   state,
 #ifdef __VOLUME__
-				/* exit/enter volume */
-				kernel_volume_stack_enter_exit(kg, shadow_sd, ps.volume_stack);
+			                                   ps,
 #endif
-
-				bounce++;
+			                                   isect,
+			                                   ray,
+			                                   &throughput))
+			{
+				return true;
 			}
+			/* Move ray forward. */
+			ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
+			if(ray->t != FLT_MAX) {
+				ray->D = normalize_len(Pend - ray->P, &ray->t);
+			}
+			bounce++;
+		}
+#    ifdef __VOLUME__
+		/* Attenuation for last line segment towards light. */
+		if(ps->volume_stack[0].shader != SHADER_NONE) {
+			kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
 		}
+#    endif
+		*shadow *= throughput;
+		return is_zero(throughput);
 	}
-#ifdef __VOLUME__
-	else if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
-		/* apply attenuation from current volume shader */
+#    ifdef __VOLUME__
+	if(!blocked && state->volume_stack[0].shader != SHADER_NONE) {
+		/* Apply attenuation from current volume shader. */
 		kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
 	}
-#endif
-#endif
-
+#    endif
 	return blocked;
 }
 
+ccl_device bool shadow_blocked_transparent_stepped(
+        KernelGlobals *kg,
+        ShaderData *shadow_sd,
+        ccl_addr_space PathState *state,
+        const int skip_object,
+        Ray *ray,
+        Intersection *isect,
+        float3 *shadow)
+{
+	bool blocked, is_transparent_isect;
+	if (skip_object == OBJECT_NONE) {
+		blocked = scene_intersect(kg,
+		                          *ray,
+		                          PATH_RAY_SHADOW_OPAQUE,
+		                          isect,
+		                          NULL,
+		                          0.0f, 0.0f);
+		is_transparent_isect = blocked
+			        ? shader_transparent_shadow(kg, isect)
+			        : false;
+	}
+	else {
+		blocked = false;
+		is_transparent_isect = false;
+	}
+	return shadow_blocked_transparent_stepped_loop(kg,
+	                                               shadow_sd,
+	                                               state,
+	                                               skip_object,
+	                                               ray,
+	                                               isect,
+	                                               blocked,
+	                                               is_transparent_isect,
+	                                               shadow);
+}
+
+#  endif  /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
+#endif /* __TRANSPARENT_SHADOWS__ */
+
+ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
+                                      ShaderData *shadow_sd,
+                                      ccl_addr_space PathState *state,
+                                      Ray *ray_input,
+                                      float3 *shadow)
+{
+	Ray *ray = ray_input;
+	Intersection isect;
+	/* Some common early checks. */
+	*shadow = make_float3(1.0f, 1.0f, 1.0f);
+	if(ray->t == 0.0f) {
+		return false;
+	}
+#ifdef __SHADOW_TRICKS__
+    const int skip_object = state->catcher_object;
+#else
+    const int skip_object = OBJECT_NONE;
 #endif
+	/* Do actual shadow shading. */
+	/* First of all, we check if integrator requires transparent shadows.
+	 * if not, we use simplest and fastest ever way to calculate occlusion.
+	 *
+	 * NOTE: We can't do quick opaque test here if we are on shadow-catcher
+	 * path because we don't want catcher object to be casting shadow here.
+	 */
+#ifdef __TRANSPARENT_SHADOWS__
+	if(!kernel_data.integrator.transparent_shadows &&
+	   skip_object == OBJECT_NONE)
+#endif
+	{
+		return shadow_blocked_opaque(kg,
+		                             shadow_sd,
+		                             state,
+		                             ray,
+		                             &isect,
+		                             shadow);
+	}
+#ifdef __TRANSPARENT_SHADOWS__
+#  ifdef __SHADOW_RECORD_ALL__
+	/* For the transparent shadows we try to use record-all logic on the
+	 * devices which support this.
+	 */
+	const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+	/* Check transparent bounces here, for volume scatter which can do
+	 * lighting before surface path termination is checked.
+	 */
+	if(state->transparent_bounce >= transparent_max_bounce) {
+		return true;
+	}
+	const uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
+#    ifdef __KERNEL_GPU__
+	/* On GPU we do trickery with tracing opaque ray first, this avoids speed
+	 * regressions in some files.
+	 *
+	 * TODO(sergey): Check why using record-all behavior causes slowdown in such
+	 * cases. Could that be caused by a higher spill pressure?
+	 */
+	const bool blocked = scene_intersect(kg,
+	                                     *ray,
+	                                     PATH_RAY_SHADOW_OPAQUE,
+	                                     &isect,
+	                                     NULL,
+	                                     0.0f, 0.0f);
+	const bool is_transparent_isect = blocked
+	        ? shader_transparent_shadow(kg, &isect)
+	        : false;
+	if(!blocked || !is_transparent_isect ||
+	   max_hits + 1 >= SHADOW_STACK_MAX_HITS)
+	{
+		return shadow_blocked_transparent_stepped_loop(kg,
+		                                               shadow_sd,
+		                                               state,
+		                                               skip_object,
+		                                               ray,
+		                                               &isect,
+		                                               blocked,
+		                                               is_transparent_isect,
+		                                               shadow);
+	}
+#    endif  /* __KERNEL_GPU__ */
+	return shadow_blocked_transparent_all(kg,
+	                                      shadow_sd,
+	                                      state,
+	                                      skip_object,
+	                                      ray,
+	                                      max_hits,
+	                                      shadow);
+#  else  /* __SHADOW_RECORD_ALL__ */
+	/* Fall back to the slowest version which works on all devices. */
+	return shadow_blocked_transparent_stepped(kg,
+	                                          shadow_sd,
+	                                          state,
+	                                          skip_object,
+	                                          ray,
+	                                          &isect,
+	                                          shadow);
+#  endif  /* __SHADOW_RECORD_ALL__ */
+#endif  /* __TRANSPARENT_SHADOWS__ */
+}
 
-CCL_NAMESPACE_END
+#undef SHADOW_STACK_MAX_HITS
 
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index c5652ebf7dc..6c8b7cca4ce 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -201,7 +201,7 @@ ccl_device float3 subsurface_color_pow(float3 color, float exponent)
 
 ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
                                            ShaderData *sd,
-                                           PathState *state,
+                                           ccl_addr_space PathState *state,
                                            int state_flag,
                                            float3 *eval,
                                            float3 *N)
@@ -239,7 +239,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
         SubsurfaceIntersection *ss_isect,
         ShaderData *sd,
         ShaderClosure *sc,
-        uint *lcg_state,
+        RNG *lcg_state,
         float disk_u,
         float disk_v,
         bool all)
@@ -293,7 +293,12 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	float3 disk_P = (disk_r*cosf(phi)) * disk_T + (disk_r*sinf(phi)) * disk_B;
 
 	/* create ray */
+#ifdef __SPLIT_KERNEL__
+	Ray ray_object = ss_isect->ray;
+	Ray *ray = &ray_object;
+#else
 	Ray *ray = &ss_isect->ray;
+#endif
 	ray->P = sd->P + disk_N*disk_height + disk_P;
 	ray->D = -disk_N;
 	ray->t = 2.0f*disk_height;
@@ -304,7 +309,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	/* intersect with the same object. if multiple intersections are found it
 	 * will use at most BSSRDF_MAX_HITS hits, a random subset of all hits */
 	scene_intersect_subsurface(kg,
-	                           ray,
+	                           *ray,
 	                           ss_isect,
 	                           sd->object,
 	                           lcg_state,
@@ -314,20 +319,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 	for(int hit = 0; hit < num_eval_hits; hit++) {
 		/* Quickly retrieve P and Ng without setting up ShaderData. */
 		float3 hit_P;
-		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE) {
+		if(sd->type & PRIMITIVE_TRIANGLE) {
 			hit_P = triangle_refine_subsurface(kg,
 			                                   sd,
 			                                   &ss_isect->hits[hit],
 			                                   ray);
 		}
 #ifdef __OBJECT_MOTION__
-		else  if(ccl_fetch(sd, type) & PRIMITIVE_MOTION_TRIANGLE) {
+		else  if(sd->type & PRIMITIVE_MOTION_TRIANGLE) {
 			float3 verts[3];
 			motion_triangle_vertices(
 			        kg,
-			        ccl_fetch(sd, object),
+			        sd->object,
 			        kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim),
-			        ccl_fetch(sd, time),
+			        sd->time,
 			        verts);
 			hit_P = motion_triangle_refine_subsurface(kg,
 			                                          sd,
@@ -367,6 +372,10 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
 		ss_isect->weight[hit] = eval;
 	}
 
+#ifdef __SPLIT_KERNEL__
+	ss_isect->ray = *ray;
+#endif
+
 	return num_eval_hits;
 }
 
@@ -375,13 +384,19 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
         SubsurfaceIntersection* ss_isect,
         int hit,
         ShaderData *sd,
-        PathState *state,
+        ccl_addr_space PathState *state,
         int state_flag,
         ShaderClosure *sc,
         bool all)
 {
+#ifdef __SPLIT_KERNEL__
+	Ray ray_object = ss_isect->ray;
+	Ray *ray = &ray_object;
+#else
+	Ray *ray = &ss_isect->ray;
+#endif
 	/* Setup new shading point. */
-	shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], &ss_isect->ray);
+	shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
 
 	/* Optionally blur colors and bump mapping. */
 	float3 weight = ss_isect->weight[hit];
@@ -392,6 +407,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
 	subsurface_scatter_setup_diffuse_bsdf(sd, sc, weight, true, N);
 }
 
+#ifndef __SPLIT_KERNEL__
 /* subsurface scattering step, from a point on the surface to another nearby point on the same object */
 ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathState *state,
 	int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
@@ -448,7 +464,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 	/* intersect with the same object. if multiple intersections are
 	 * found it will randomly pick one of them */
 	SubsurfaceIntersection ss_isect;
-	scene_intersect_subsurface(kg, &ray, &ss_isect, sd->object, lcg_state, 1);
+	scene_intersect_subsurface(kg, ray, &ss_isect, sd->object, lcg_state, 1);
 
 	/* evaluate bssrdf */
 	if(ss_isect.num_hits > 0) {
@@ -481,6 +497,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, PathS
 	/* setup diffuse bsdf */
 	subsurface_scatter_setup_diffuse_bsdf(sd, sc, eval, (ss_isect.num_hits > 0), N);
 }
+#endif /* ! __SPLIT_KERNEL__ */
 
 CCL_NAMESPACE_END
 
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index 8d5bb75a428..cb1a3f40dee 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -32,6 +32,7 @@ KERNEL_TEX(uint, texture_uint, __prim_visibility)
 KERNEL_TEX(uint, texture_uint, __prim_index)
 KERNEL_TEX(uint, texture_uint, __prim_object)
 KERNEL_TEX(uint, texture_uint, __object_node)
+KERNEL_TEX(float2, texture_float2, __prim_time)
 
 /* objects */
 KERNEL_TEX(float4, texture_float4, __objects)
@@ -177,7 +178,6 @@ KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_085)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_086)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_087)
 KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_088)
-KERNEL_IMAGE_TEX(uchar4, texture_image_uchar4, __tex_image_byte4_089)
 
 #  else
 /* bindless textures */
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 8c271c75e44..19c91248922 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -17,9 +17,9 @@
 #ifndef __KERNEL_TYPES_H__
 #define __KERNEL_TYPES_H__
 
-#include "kernel_math.h"
-#include "svm/svm_types.h"
-#include "util_static_assert.h"
+#include "kernel/kernel_math.h"
+#include "kernel/svm/svm_types.h"
+#include "util/util_static_assert.h"
 
 #ifndef __KERNEL_GPU__
 #  define __KERNEL_CPU__
@@ -56,6 +56,8 @@ CCL_NAMESPACE_BEGIN
 
 #define VOLUME_STACK_SIZE		16
 
+#define WORK_POOL_SIZE 64
+
 /* device capabilities */
 #ifdef __KERNEL_CPU__
 #  ifdef __KERNEL_SSE2__
@@ -63,27 +65,34 @@ CCL_NAMESPACE_BEGIN
 #  endif
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
-#  define __BRANCHED_PATH__
+#  ifndef __SPLIT_KERNEL__
+#    define __BRANCHED_PATH__
+#  endif
 #  ifdef WITH_OSL
 #    define __OSL__
 #  endif
 #  define __SUBSURFACE__
 #  define __CMJ__
 #  define __VOLUME__
-#  define __VOLUME_DECOUPLED__
 #  define __VOLUME_SCATTER__
 #  define __SHADOW_RECORD_ALL__
-#  define __VOLUME_RECORD_ALL__
+#  ifndef __SPLIT_KERNEL__
+#    define __VOLUME_DECOUPLED__
+#    define __VOLUME_RECORD_ALL__
+#  endif
 #endif  /* __KERNEL_CPU__ */
 
 #ifdef __KERNEL_CUDA__
 #  define __KERNEL_SHADING__
 #  define __KERNEL_ADV_SHADING__
-#  define __BRANCHED_PATH__
 #  define __VOLUME__
 #  define __VOLUME_SCATTER__
 #  define __SUBSURFACE__
-#  define __CMJ__
+#  define __SHADOW_RECORD_ALL__
+#  ifndef __SPLIT_KERNEL__
+#    define __BRANCHED_PATH__
+#    define __CMJ__
+#  endif
 #endif  /* __KERNEL_CUDA__ */
 
 #ifdef __KERNEL_OPENCL__
@@ -93,6 +102,10 @@ CCL_NAMESPACE_BEGIN
 #  ifdef __KERNEL_OPENCL_NVIDIA__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __SUBSURFACE__
+#    define __VOLUME__
+#    define __VOLUME_SCATTER__
+#    define __SHADOW_RECORD_ALL__
 #    ifdef __KERNEL_EXPERIMENTAL__
 #      define __CMJ__
 #    endif
@@ -114,6 +127,10 @@ CCL_NAMESPACE_BEGIN
 #    define __CL_USE_NATIVE__
 #    define __KERNEL_SHADING__
 #    define __KERNEL_ADV_SHADING__
+#    define __SUBSURFACE__
+#    define __VOLUME__
+#    define __VOLUME_SCATTER__
+#    define __SHADOW_RECORD_ALL__
 #  endif  /* __KERNEL_OPENCL_AMD__ */
 
 #  ifdef __KERNEL_OPENCL_INTEL_CPU__
@@ -140,6 +157,7 @@ CCL_NAMESPACE_BEGIN
 #define __INTERSECTION_REFINE__
 #define __CLAMP_SAMPLE__
 #define __PATCH_EVAL__
+#define __SHADOW_TRICKS__
 
 #ifdef __KERNEL_SHADING__
 #  define __SVM__
@@ -195,6 +213,9 @@ CCL_NAMESPACE_BEGIN
 #ifdef __NO_TRANSPARENT__
 #  undef __TRANSPARENT_SHADOWS__
 #endif
+#ifdef __NO_SHADOW_TRICKS__
+#undef __SHADOW_TRICKS__
+#endif
 
 /* Random Numbers */
 
@@ -299,6 +320,8 @@ enum PathRayFlag {
 	PATH_RAY_MIS_SKIP = 4096,
 	PATH_RAY_DIFFUSE_ANCESTOR = 8192,
 	PATH_RAY_SINGLE_PASS_DONE = 16384,
+	PATH_RAY_SHADOW_CATCHER = 32768,
+	PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
 };
 
 /* Closure Label */
@@ -428,6 +451,20 @@ typedef ccl_addr_space struct PathRadiance {
 	float4 shadow;
 	float mist;
 #endif
+
+#ifdef __SHADOW_TRICKS__
+	/* Total light reachable across the path, ignoring shadow blocked queries. */
+	float3 path_total;
+	/* Total light reachable across the path with shadow blocked queries
+	 * applied here.
+	 *
+	 * Dividing this figure by path_total gives an estimate of the shadow pass.
+	 */
+	float3 path_total_shaded;
+
+	/* Color of the background on which shadow is alpha-overed. */
+	float3 shadow_color;
+#endif
 } PathRadiance;
 
 typedef struct BsdfEval {
@@ -443,6 +480,9 @@ typedef struct BsdfEval {
 	float3 subsurface;
 	float3 scatter;
 #endif
+#ifdef __SHADOW_TRICKS__
+	float3 sum_no_mis;
+#endif
 } BsdfEval;
 
 /* Shader Flag */
@@ -536,7 +576,7 @@ typedef struct Ray {
 
 /* Intersection */
 
-typedef ccl_addr_space struct Intersection {
+typedef struct Intersection {
 	float t, u, v;
 	int prim;
 	int object;
@@ -788,108 +828,89 @@ enum ShaderDataObjectFlag {
 	SD_OBJECT_INTERSECTS_VOLUME      = (1 << 5),
 	/* Has position for motion vertices. */
 	SD_OBJECT_HAS_VERTEX_MOTION      = (1 << 6),
+	/* Object is used to catch shadows. */
+	SD_OBJECT_SHADOW_CATCHER         = (1 << 7),
 
 	SD_OBJECT_FLAGS = (SD_OBJECT_HOLDOUT_MASK |
 	                   SD_OBJECT_MOTION |
 	                   SD_OBJECT_TRANSFORM_APPLIED |
 	                   SD_OBJECT_NEGATIVE_SCALE_APPLIED |
 	                   SD_OBJECT_HAS_VOLUME |
-	                   SD_OBJECT_INTERSECTS_VOLUME)
+	                   SD_OBJECT_INTERSECTS_VOLUME |
+	                   SD_OBJECT_SHADOW_CATCHER)
 };
 
-#ifdef __SPLIT_KERNEL__
-#  define SD_THREAD (get_global_id(1) * get_global_size(0) + get_global_id(0))
-#  if !defined(__SPLIT_KERNEL_SOA__)
-     /* ShaderData is stored as an Array-of-Structures */
-#    define ccl_soa_member(type, name) type soa_##name
-#    define ccl_fetch(s, t) (s[SD_THREAD].soa_##t)
-#    define ccl_fetch_array(s, t, index) (&s[SD_THREAD].soa_##t[index])
-#  else
-     /* ShaderData is stored as an Structure-of-Arrays */
-#    define SD_GLOBAL_SIZE (get_global_size(0) * get_global_size(1))
-#    define SD_FIELD_SIZE(t) sizeof(((struct ShaderData*)0)->t)
-#    define SD_OFFSETOF(t) ((char*)(&((struct ShaderData*)0)->t) - (char*)0)
-#    define ccl_soa_member(type, name) type soa_##name
-#    define ccl_fetch(s, t) (((ShaderData*)((ccl_addr_space char*)s + SD_GLOBAL_SIZE * SD_OFFSETOF(soa_##t) +  SD_FIELD_SIZE(soa_##t) * SD_THREAD - SD_OFFSETOF(soa_##t)))->soa_##t)
-#    define ccl_fetch_array(s, t, index) (&ccl_fetch(s, t)[index])
-#  endif
-#else
-#  define ccl_soa_member(type, name) type name
-#  define ccl_fetch(s, t) (s->t)
-#  define ccl_fetch_array(s, t, index) (&s->t[index])
-#endif
-
 typedef ccl_addr_space struct ShaderData {
 	/* position */
-	ccl_soa_member(float3, P);
+	float3 P;
 	/* smooth normal for shading */
-	ccl_soa_member(float3, N);
+	float3 N;
 	/* true geometric normal */
-	ccl_soa_member(float3, Ng);
+	float3 Ng;
 	/* view/incoming direction */
-	ccl_soa_member(float3, I);
+	float3 I;
 	/* shader id */
-	ccl_soa_member(int, shader);
+	int shader;
 	/* booleans describing shader, see ShaderDataFlag */
-	ccl_soa_member(int, flag);
+	int flag;
 	/* booleans describing object of the shader, see ShaderDataObjectFlag */
-	ccl_soa_member(int, object_flag);
+	int object_flag;
 
 	/* primitive id if there is one, ~0 otherwise */
-	ccl_soa_member(int, prim);
+	int prim;
 
 	/* combined type and curve segment for hair */
-	ccl_soa_member(int, type);
+	int type;
 
 	/* parametric coordinates
 	 * - barycentric weights for triangles */
-	ccl_soa_member(float, u);
-	ccl_soa_member(float, v);
+	float u;
+	float v;
 	/* object id if there is one, ~0 otherwise */
-	ccl_soa_member(int, object);
+	int object;
 
 	/* motion blur sample time */
-	ccl_soa_member(float, time);
+	float time;
 
 	/* length of the ray being shaded */
-	ccl_soa_member(float, ray_length);
+	float ray_length;
 
 #ifdef __RAY_DIFFERENTIALS__
 	/* differential of P. these are orthogonal to Ng, not N */
-	ccl_soa_member(differential3, dP);
+	differential3 dP;
 	/* differential of I */
-	ccl_soa_member(differential3, dI);
+	differential3 dI;
 	/* differential of u, v */
-	ccl_soa_member(differential, du);
-	ccl_soa_member(differential, dv);
+	differential du;
+	differential dv;
 #endif
 #ifdef __DPDU__
 	/* differential of P w.r.t. parametric coordinates. note that dPdu is
 	 * not readily suitable as a tangent for shading on triangles. */
-	ccl_soa_member(float3, dPdu);
-	ccl_soa_member(float3, dPdv);
+	float3 dPdu;
+	float3 dPdv;
 #endif
 
 #ifdef __OBJECT_MOTION__
 	/* object <-> world space transformations, cached to avoid
 	 * re-interpolating them constantly for shading */
-	ccl_soa_member(Transform, ob_tfm);
-	ccl_soa_member(Transform, ob_itfm);
+	Transform ob_tfm;
+	Transform ob_itfm;
 #endif
 
 	/* Closure data, we store a fixed array of closures */
-	ccl_soa_member(struct ShaderClosure, closure[MAX_CLOSURE]);
-	ccl_soa_member(int, num_closure);
-	ccl_soa_member(int, num_closure_extra);
-	ccl_soa_member(float, randb_closure);
-	ccl_soa_member(float3, svm_closure_weight);
+	struct ShaderClosure closure[MAX_CLOSURE];
+	int num_closure;
+	int num_closure_extra;
+	float randb_closure;
+	float3 svm_closure_weight;
 
 	/* LCG state for closures that require additional random numbers. */
-	ccl_soa_member(uint, lcg_state);
+	uint lcg_state;
 
 	/* ray start position, only set for backgrounds */
-	ccl_soa_member(float3, ray_P);
-	ccl_soa_member(differential3, ray_dP);
+	float3 ray_P;
+	differential3 ray_dP;
 
 #ifdef __OSL__
 	struct KernelGlobals *osl_globals;
@@ -935,12 +956,16 @@ typedef struct PathState {
 	RNG rng_congruential;
 	VolumeStack volume_stack[VOLUME_STACK_SIZE];
 #endif
+
+#ifdef __SHADOW_TRICKS__
+	int catcher_object;
+#endif
 } PathState;
 
 /* Subsurface */
 
 /* Struct to gather multiple SSS hits. */
-struct SubsurfaceIntersection
+typedef struct SubsurfaceIntersection
 {
 	Ray ray;
 	float3 weight[BSSRDF_MAX_HITS];
@@ -948,10 +973,10 @@ struct SubsurfaceIntersection
 	int num_hits;
 	struct Intersection hits[BSSRDF_MAX_HITS];
 	float3 Ng[BSSRDF_MAX_HITS];
-};
+} SubsurfaceIntersection;
 
 /* Struct to gather SSS indirect rays and delay tracing them. */
-struct SubsurfaceIndirectRays
+typedef struct SubsurfaceIndirectRays
 {
 	bool need_update_volume_stack;
 	bool tracing;
@@ -962,7 +987,7 @@ struct SubsurfaceIndirectRays
 	struct Ray rays[BSSRDF_MAX_HITS];
 	float3 throughputs[BSSRDF_MAX_HITS];
 	struct PathRadiance L[BSSRDF_MAX_HITS];
-};
+} SubsurfaceIndirectRays;
 
 /* Constant Kernel Data
  *
@@ -1201,7 +1226,8 @@ typedef struct KernelBVH {
 	int have_curves;
 	int have_instancing;
 	int use_qbvh;
-	int pad1, pad2;
+	int use_bvh_steps;
+	int pad1;
 } KernelBVH;
 static_assert_align(KernelBVH, 16);
 
@@ -1296,20 +1322,19 @@ enum QueueNumber {
 #define RAY_STATE_MASK 0x007
 #define RAY_FLAG_MASK 0x0F8
 enum RayState {
+	RAY_INVALID = 0,
 	/* Denotes ray is actively involved in path-iteration. */
-	RAY_ACTIVE = 0,
+	RAY_ACTIVE,
 	/* Denotes ray has completed processing all samples and is inactive. */
-	RAY_INACTIVE = 1,
+	RAY_INACTIVE,
 	/* Denoted ray has exited path-iteration and needs to update output buffer. */
-	RAY_UPDATE_BUFFER = 2,
+	RAY_UPDATE_BUFFER,
 	/* Donotes ray has hit background */
-	RAY_HIT_BACKGROUND = 3,
+	RAY_HIT_BACKGROUND,
 	/* Denotes ray has to be regenerated */
-	RAY_TO_REGENERATE = 4,
+	RAY_TO_REGENERATE,
 	/* Denotes ray has been regenerated */
-	RAY_REGENERATED = 5,
-	/* Denotes ray should skip direct lighting */
-	RAY_SKIP_DL = 6,
+	RAY_REGENERATED,
 	/* Flag's ray has to execute shadow blocked function in AO part */
 	RAY_SHADOW_RAY_CAST_AO = 16,
 	/* Flag's ray has to execute shadow blocked function in direct lighting part. */
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index c7cb29b5af2..9c0878249d4 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -38,7 +38,7 @@ typedef struct VolumeShaderCoefficients {
 /* evaluate shader to get extinction coefficient at P */
 ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
                                                        ShaderData *sd,
-                                                       PathState *state,
+                                                       ccl_addr_space PathState *state,
                                                        float3 P,
                                                        float3 *extinction)
 {
@@ -64,7 +64,7 @@ ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
 /* evaluate shader to get absorption, scattering and emission at P */
 ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
                                             ShaderData *sd,
-                                            PathState *state,
+                                            ccl_addr_space PathState *state,
                                             float3 P,
                                             VolumeShaderCoefficients *coeff)
 {
@@ -112,7 +112,7 @@ ccl_device float kernel_volume_channel_get(float3 value, int channel)
 	return (channel == 0)? value.x: ((channel == 1)? value.y: value.z);
 }
 
-ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, VolumeStack *stack)
+ccl_device bool volume_stack_is_heterogeneous(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
 {
 	for(int i = 0; stack[i].shader != SHADER_NONE; i++) {
 		int shader_flag = kernel_tex_fetch(__shader_flag, (stack[i].shader & SHADER_MASK)*SHADER_SIZE);
@@ -161,7 +161,11 @@ ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stac
 
 /* homogeneous volume: assume shader evaluation at the starts gives
  * the extinction coefficient for the entire line segment */
-ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
+ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg,
+                                                 ccl_addr_space PathState *state,
+                                                 Ray *ray,
+                                                 ShaderData *sd,
+                                                 float3 *throughput)
 {
 	float3 sigma_t;
 
@@ -171,7 +175,11 @@ ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg, PathState *s
 
 /* heterogeneous volume: integrate stepping through the volume until we
  * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState *state, Ray *ray, ShaderData *sd, float3 *throughput)
+ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
+                                                   ccl_addr_space PathState *state,
+                                                   Ray *ray,
+                                                   ShaderData *sd,
+                                                   float3 *throughput)
 {
 	float3 tp = *throughput;
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -179,7 +187,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
 	float step = kernel_data.integrator.volume_step_size;
-	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step;
+	float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step;
 
 	/* compute extinction at the start */
 	float t = 0.0f;
@@ -193,7 +201,7 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 		/* use random position inside this segment to sample shader */
 		if(new_t == ray->t)
-			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+			random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * dt;
 
 		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
 		float3 sigma_t;
@@ -227,7 +235,11 @@ ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg, PathState
 
 /* get the volume attenuation over line segment defined by ray, with the
  * assumption that there are no surfaces blocking light between the endpoints */
-ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg, ShaderData *shadow_sd, PathState *state, Ray *ray, float3 *throughput)
+ccl_device_noinline void kernel_volume_shadow(KernelGlobals *kg,
+                                              ShaderData *shadow_sd,
+                                              ccl_addr_space PathState *state,
+                                              Ray *ray,
+                                              float3 *throughput)
 {
 	shader_setup_from_volume(kg, shadow_sd, ray);
 
@@ -341,9 +353,15 @@ ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coe
 
 /* homogeneous volume: assume shader evaluation at the start gives
  * the volume shading coefficient for the entire line segment */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGlobals *kg,
-	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput,
-	RNG *rng, bool probalistic_scatter)
+ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    Ray *ray,
+    ShaderData *sd,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    RNG *rng,
+    bool probalistic_scatter)
 {
 	VolumeShaderCoefficients coeff;
 
@@ -444,8 +462,14 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(KernelGloba
  * volume until we reach the end, get absorbed entirely, or run out of
  * iterations. this does probabilistically scatter or get transmitted through
  * for path tracing where we don't want to branch. */
-ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
-	PathState *state, Ray *ray, ShaderData *sd, PathRadiance *L, float3 *throughput, RNG *rng)
+ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    Ray *ray,
+    ShaderData *sd,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    RNG *rng)
 {
 	float3 tp = *throughput;
 	const float tp_eps = 1e-6f; /* todo: this is likely not the right value */
@@ -453,7 +477,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 	/* prepare for stepping */
 	int max_steps = kernel_data.integrator.volume_max_steps;
 	float step_size = kernel_data.integrator.volume_step_size;
-	float random_jitter_offset = lcg_step_float(&state->rng_congruential) * step_size;
+	float random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * step_size;
 
 	/* compute coefficients at the start */
 	float t = 0.0f;
@@ -474,7 +498,7 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
 
 		/* use random position inside this segment to sample shader */
 		if(new_t == ray->t)
-			random_jitter_offset = lcg_step_float(&state->rng_congruential) * dt;
+			random_jitter_offset = lcg_step_float_addrspace(&state->rng_congruential) * dt;
 
 		float3 new_P = ray->P + ray->D * (t + random_jitter_offset);
 		VolumeShaderCoefficients coeff;
@@ -579,8 +603,15 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
  * ray, with the assumption that there are no surfaces blocking light
  * between the endpoints. distance sampling is used to decide if we will
  * scatter or not. */
-ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals *kg,
-	PathState *state, ShaderData *sd, Ray *ray, PathRadiance *L, float3 *throughput, RNG *rng, bool heterogeneous)
+ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(
+    KernelGlobals *kg,
+    ccl_addr_space PathState *state,
+    ShaderData *sd,
+    Ray *ray,
+    PathRadiance *L,
+    ccl_addr_space float3 *throughput,
+    RNG *rng,
+    bool heterogeneous)
 {
 	shader_setup_from_volume(kg, sd, ray);
 
@@ -590,6 +621,7 @@ ccl_device_noinline VolumeIntegrateResult kernel_volume_integrate(KernelGlobals
 		return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, rng, true);
 }
 
+#ifndef __SPLIT_KERNEL__
 /* Decoupled Volume Sampling
  *
  * VolumeSegment is list of coefficients and transmittance stored at all steps
@@ -966,7 +998,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 			mis_weight = 2.0f*power_heuristic(pdf, distance_pdf);
 		}
 	}
-	if(sample_t < 1e-6f) {
+	if(sample_t < 1e-6f || pdf == 0.0f) {
 		return VOLUME_PATH_SCATTERED;
 	}
 
@@ -990,6 +1022,7 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
 
 	return VOLUME_PATH_SCATTERED;
 }
+#endif /* __SPLIT_KERNEL__ */
 
 /* decide if we need to use decoupled or not */
 ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneous, bool direct, int sampling_method)
@@ -1021,9 +1054,9 @@ ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg, bool heterogeneou
 
 ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
                                          ShaderData *stack_sd,
-                                         const PathState *state,
-                                         const Ray *ray,
-                                         VolumeStack *stack)
+                                         ccl_addr_space const PathState *state,
+                                         ccl_addr_space const Ray *ray,
+                                         ccl_addr_space VolumeStack *stack)
 {
 	/* NULL ray happens in the baker, does it need proper initialization of
 	 * camera in volume?
@@ -1166,7 +1199,7 @@ ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
 	}
 }
 
-ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, VolumeStack *stack)
+ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd, ccl_addr_space VolumeStack *stack)
 {
 	/* todo: we should have some way for objects to indicate if they want the
 	 * world shader to work inside them. excluding it by default is problematic
@@ -1215,7 +1248,7 @@ ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg, ShaderData *sd
 ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
                                                           ShaderData *stack_sd,
                                                           Ray *ray,
-                                                          VolumeStack *stack)
+                                                          ccl_addr_space VolumeStack *stack)
 {
 	kernel_assert(kernel_data.integrator.use_volumes);
 
@@ -1277,7 +1310,7 @@ ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
  * the world's one after the last bounce to avoid render artifacts.
  */
 ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
-                                                 VolumeStack *volume_stack)
+                                                 ccl_addr_space VolumeStack *volume_stack)
 {
 	if(kernel_data.background.volume_shader != SHADER_NONE) {
 		/* Keep the world's volume in stack. */
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 7d559b1aa31..28fc5ce1c30 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -17,177 +17,102 @@
 #ifndef __KERNEL_WORK_STEALING_H__
 #define __KERNEL_WORK_STEALING_H__
 
+CCL_NAMESPACE_BEGIN
+
 /*
  * Utility functions for work stealing
  */
 
-#ifdef __WORK_STEALING__
-
 #ifdef __KERNEL_OPENCL__
 #  pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
 #endif
 
-uint get_group_id_with_ray_index(uint ray_index,
-                                 uint tile_dim_x,
-                                 uint tile_dim_y,
-                                 uint parallel_samples,
-                                 int dim)
+ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
+{
+	return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples;
+}
+
+ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
+{
+	return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index)
+{
+	return ray_index / WORK_POOL_SIZE;
+}
+
+ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
 {
-	if(dim == 0) {
-		uint x_span = ray_index % (tile_dim_x * parallel_samples);
-		return x_span / get_local_size(0);
+	uint total_work_size = kernel_total_work_size(kg);
+	uint num_pools = kernel_num_work_pools(kg);
+
+	if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) {
+		return 0;
+	}
+
+	uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
+
+	uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
+	if(work_pool < remainder / WORK_POOL_SIZE) {
+		work_size += WORK_POOL_SIZE;
 	}
-	else /*if(dim == 1)*/ {
-		kernel_assert(dim == 1);
-		uint y_span = ray_index / (tile_dim_x * parallel_samples);
-		return y_span / get_local_size(1);
+	else if(work_pool == remainder / WORK_POOL_SIZE) {
+		work_size += remainder % WORK_POOL_SIZE;
 	}
+
+	return work_size;
 }
 
-uint get_total_work(uint tile_dim_x,
-                    uint tile_dim_y,
-                    uint grp_idx,
-                    uint grp_idy,
-                    uint num_samples)
+ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index)
 {
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	return threads_within_tile_border_x *
-	       threads_within_tile_border_y *
-	       num_samples;
+	uint num_pools = kernel_num_work_pools(kg);
+	uint pool = work_pool_from_ray_index(kg, ray_index);
+
+	return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
+	       + (pool * WORK_POOL_SIZE)
+	       + (work_index % WORK_POOL_SIZE);
 }
 
-/* Returns 0 in case there is no next work available */
-/* Returns 1 in case work assigned is valid */
-int get_next_work(ccl_global uint *work_pool,
-                  ccl_private uint *my_work,
-                  uint tile_dim_x,
-                  uint tile_dim_y,
-                  uint num_samples,
-                  uint parallel_samples,
-                  uint ray_index)
+/* Returns true if there is work */
+ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index)
 {
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint total_work = get_total_work(tile_dim_x,
-	                                 tile_dim_y,
-	                                 grp_idx,
-	                                 grp_idy,
-	                                 num_samples);
-	uint group_index = grp_idy * get_num_groups(0) + grp_idx;
-	*my_work = atomic_inc(&work_pool[group_index]);
-	return (*my_work < total_work) ? 1 : 0;
+	uint work_pool = work_pool_from_ray_index(kg, ray_index);
+	uint pool_size = work_pool_work_size(kg, work_pool);
+
+	if(pool_size == 0) {
+		return false;
+	}
+
+	*work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
+	return (*work_index < pool_size);
 }
 
-/* This function assumes that the passed my_work is valid. */
-/* Decode sample number w.r.t. assigned my_work. */
-uint get_my_sample(uint my_work,
-                   uint tile_dim_x,
-                   uint tile_dim_y,
-                   uint parallel_samples,
-                   uint ray_index)
+/* This function assumes that the passed `work_index` is valid. */
+/* Decode the sample number from the assigned `work_index`. */
+ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index)
 {
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	return my_work /
-	       (threads_within_tile_border_x * threads_within_tile_border_y);
+	return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h);
 }
 
-/* Decode pixel and tile position w.r.t. assigned my_work. */
-void get_pixel_tile_position(ccl_private uint *pixel_x,
+/* Decode pixel and tile position w.r.t. assigned `work_index`. */
+ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
+                             ccl_private uint *pixel_x,
                              ccl_private uint *pixel_y,
                              ccl_private uint *tile_x,
                              ccl_private uint *tile_y,
-                             uint my_work,
-                             uint tile_dim_x,
-                             uint tile_dim_y,
-                             uint tile_offset_x,
-                             uint tile_offset_y,
-                             uint parallel_samples,
+                             uint work_index,
                              uint ray_index)
 {
-	uint grp_idx = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           0);
-	uint grp_idy = get_group_id_with_ray_index(ray_index,
-	                                           tile_dim_x,
-	                                           tile_dim_y,
-	                                           parallel_samples,
-	                                           1);
-	uint threads_within_tile_border_x =
-		(grp_idx == (get_num_groups(0) - 1)) ? tile_dim_x % get_local_size(0)
-		                                     : get_local_size(0);
-	uint threads_within_tile_border_y =
-		(grp_idy == (get_num_groups(1) - 1)) ? tile_dim_y % get_local_size(1)
-		                                     : get_local_size(1);
-
-	threads_within_tile_border_x =
-		(threads_within_tile_border_x == 0) ? get_local_size(0)
-		                                    : threads_within_tile_border_x;
-	threads_within_tile_border_y =
-		(threads_within_tile_border_y == 0) ? get_local_size(1)
-		                                    : threads_within_tile_border_y;
-
-	uint total_associated_pixels =
-		threads_within_tile_border_x * threads_within_tile_border_y;
-	uint work_group_pixel_index = my_work % total_associated_pixels;
-	uint work_group_pixel_x =
-		work_group_pixel_index % threads_within_tile_border_x;
-	uint work_group_pixel_y =
-		work_group_pixel_index / threads_within_tile_border_x;
-
-	*pixel_x =
-		tile_offset_x + (grp_idx * get_local_size(0)) + work_group_pixel_x;
-	*pixel_y =
-		tile_offset_y + (grp_idy * get_local_size(1)) + work_group_pixel_y;
-	*tile_x = *pixel_x - tile_offset_x;
-	*tile_y = *pixel_y - tile_offset_y;
+	uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h);
+
+	*tile_x = pixel_index % kernel_split_params.w;
+	*tile_y = pixel_index / kernel_split_params.w;
+
+	*pixel_x = *tile_x + kernel_split_params.x;
+	*pixel_y = *tile_y + kernel_split_params.y;
 }
 
-#endif  /* __WORK_STEALING__ */
+CCL_NAMESPACE_END
 
 #endif  /* __KERNEL_WORK_STEALING_H__ */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/kernels/cpu/kernel.cpp
index 72dbbd9a416..16992c681e6 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel.cpp
@@ -56,9 +56,9 @@
     /* do nothing */
 #endif
 
-#include "kernel.h"
+#include "kernel/kernel.h"
 #define KERNEL_ARCH cpu
-#include "kernel_cpu_impl.h"
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -90,7 +90,7 @@ void kernel_tex_copy(KernelGlobals *kg,
 		kg->tname.width = width; \
 	}
 #define KERNEL_IMAGE_TEX(type, ttype, tname)
-#include "kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	else if(strstr(name, "__tex_image_float4")) {
 		texture_image_float4 *tex = NULL;
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
index 1350d9e5c2e..2600d977972 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
@@ -28,10 +28,10 @@
 #  define __KERNEL_AVX__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_avx
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
index 1a416e771ee..dba15d037ac 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
@@ -29,10 +29,10 @@
 #  define __KERNEL_AVX2__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_avx2
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 1a07c705f1c..896b80d783e 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -49,4 +49,44 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        int offset,
                                        int sample);
 
+/* Split kernels */
+
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+        KernelGlobals *kg,
+        ccl_constant KernelData *data,
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+        ccl_global uint *rng_state,
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
+        ccl_global int *Queue_index,
+        int queuesize,
+        ccl_global char *use_queues_flag,
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+        ccl_global float *buffer);
+
+#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData *data);
+
+DECLARE_SPLIT_KERNEL_FUNCTION(path_init)
+DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
+DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
+DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
+
+void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
+
 #undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index ec82d4b4c22..148b2eef568 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -20,18 +20,45 @@
  * simply includes this file without worry of copying actual implementation over.
  */
 
-#include "kernel_compat_cpu.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_cpu_image.h"
-#include "kernel_film.h"
-#include "kernel_path.h"
-#include "kernel_path_branched.h"
-#include "kernel_bake.h"
+#include "kernel/kernel_compat_cpu.h"
+
+#ifndef __SPLIT_KERNEL__
+#  include "kernel/kernel_math.h"
+#  include "kernel/kernel_types.h"
+
+#  include "kernel/split/kernel_split_data.h"
+#  include "kernel/kernel_globals.h"
+
+#  include "kernel/kernels/cpu/kernel_cpu_image.h"
+#  include "kernel/kernel_film.h"
+#  include "kernel/kernel_path.h"
+#  include "kernel/kernel_path_branched.h"
+#  include "kernel/kernel_bake.h"
+#else
+#  include "kernel/split/kernel_split_common.h"
+
+#  include "kernel/split/kernel_data_init.h"
+#  include "kernel/split/kernel_path_init.h"
+#  include "kernel/split/kernel_scene_intersect.h"
+#  include "kernel/split/kernel_lamp_emission.h"
+#  include "kernel/split/kernel_do_volume.h"
+#  include "kernel/split/kernel_queue_enqueue.h"
+#  include "kernel/split/kernel_indirect_background.h"
+#  include "kernel/split/kernel_shader_eval.h"
+#  include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#  include "kernel/split/kernel_subsurface_scatter.h"
+#  include "kernel/split/kernel_direct_lighting.h"
+#  include "kernel/split/kernel_shadow_blocked_ao.h"
+#  include "kernel/split/kernel_shadow_blocked_dl.h"
+#  include "kernel/split/kernel_next_iteration_setup.h"
+#  include "kernel/split/kernel_indirect_subsurface.h"
+#  include "kernel/split/kernel_buffer_update.h"
+#endif
 
 CCL_NAMESPACE_BEGIN
 
+#ifndef __SPLIT_KERNEL__
+
 /* Path Tracing */
 
 void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
@@ -131,4 +158,72 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 	}
 }
 
+#else  /* __SPLIT_KERNEL__ */
+
+/* Split Kernel Path Tracing */
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		kernel_##name(kg); \
+	}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+	void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
+	{ \
+		ccl_local type locals; \
+		kernel_##name(kg, &locals); \
+	}
+
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+
+void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
+{
+#define REGISTER_NAME_STRING(name) #name
+#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
+#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
+
+	REGISTER(path_trace);
+	REGISTER(convert_to_byte);
+	REGISTER(convert_to_half_float);
+	REGISTER(shader);
+
+	REGISTER(data_init);
+	REGISTER(path_init);
+	REGISTER(scene_intersect);
+	REGISTER(lamp_emission);
+	REGISTER(do_volume);
+	REGISTER(queue_enqueue);
+	REGISTER(indirect_background);
+	REGISTER(shader_eval);
+	REGISTER(holdout_emission_blurring_pathtermination_ao);
+	REGISTER(subsurface_scatter);
+	REGISTER(direct_lighting);
+	REGISTER(shadow_blocked_ao);
+	REGISTER(shadow_blocked_dl);
+	REGISTER(next_iteration_setup);
+	REGISTER(indirect_subsurface);
+	REGISTER(buffer_update);
+
+#undef REGISTER
+#undef REGISTER_EVAL_NAME
+#undef REGISTER_NAME_STRING
+}
+
+#endif  /* __SPLIT_KERNEL__ */
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
new file mode 100644
index 00000000000..ca750e5a00d
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CPU kernel entry points */
+
+/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
+ * one with SSE2 intrinsics.
+ */
+#if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+/* When building kernel for native machine detect kernel features from the flags
+ * set by compiler.
+ */
+#ifdef WITH_KERNEL_NATIVE
+#  ifdef __SSE2__
+#    ifndef __KERNEL_SSE2__
+#      define __KERNEL_SSE2__
+#    endif
+#  endif
+#  ifdef __SSE3__
+#    define __KERNEL_SSE3__
+#  endif
+#  ifdef __SSSE3__
+#    define __KERNEL_SSSE3__
+#  endif
+#  ifdef __SSE4_1__
+#    define __KERNEL_SSE41__
+#  endif
+#  ifdef __AVX__
+#    define __KERNEL_AVX__
+#  endif
+#  ifdef __AVX2__
+#    define __KERNEL_SSE__
+#    define __KERNEL_AVX2__
+#  endif
+#endif
+
+/* quiet unused define warnings */
+#if defined(__KERNEL_SSE2__)
+    /* do nothing */
+#endif
+
+#include "kernel/kernel.h"
+#define KERNEL_ARCH cpu
+#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
new file mode 100644
index 00000000000..27a746a0799
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+ 
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE2__
+#  define __KERNEL_SSE3__
+#  define __KERNEL_SSSE3__
+#  define __KERNEL_SSE41__
+#  define __KERNEL_AVX__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_avx
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
new file mode 100644
index 00000000000..364d279a189
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2014 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with AVX2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE__
+#  define __KERNEL_SSE2__
+#  define __KERNEL_SSE3__
+#  define __KERNEL_SSSE3__
+#  define __KERNEL_SSE41__
+#  define __KERNEL_AVX__
+#  define __KERNEL_AVX2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_avx2
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
new file mode 100644
index 00000000000..0afb481296f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE2
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE2__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_sse2
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
new file mode 100644
index 00000000000..13d00813591
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE2__
+#  define __KERNEL_SSE3__
+#  define __KERNEL_SSSE3__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_sse3
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
new file mode 100644
index 00000000000..a4312071edc
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
+ * optimization flags and nearly all functions inlined, while kernel.cpp
+ * is compiled without for other CPU's. */
+
+/* SSE optimization disabled for now on 32 bit, see bug #36316 */
+#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
+#  define __KERNEL_SSE2__
+#  define __KERNEL_SSE3__
+#  define __KERNEL_SSSE3__
+#  define __KERNEL_SSE41__
+#endif
+
+#define __SPLIT_KERNEL__
+
+#include "util/util_optimization.h"
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+#  include "kernel/kernel.h"
+#  define KERNEL_ARCH cpu_sse41
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
index a5f2d6e7294..1acfaa91ac9 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
@@ -23,10 +23,10 @@
 #  define __KERNEL_SSE2__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_sse2
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
index 86f9ce991f8..f7b6a2e21fe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
@@ -25,10 +25,10 @@
 #  define __KERNEL_SSSE3__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_sse3
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
index c174406047d..1900c6e3012 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
@@ -26,10 +26,10 @@
 #  define __KERNEL_SSE41__
 #endif
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-#  include "kernel.h"
+#  include "kernel/kernel.h"
 #  define KERNEL_ARCH cpu_sse41
-#  include "kernel_cpu_impl.h"
+#  include "kernel/kernels/cpu/kernel_cpu_impl.h"
 #endif  /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index eb2b6ea5414..dc343cb387a 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -16,113 +16,19 @@
 
 /* CUDA kernel entry points */
 
-#include "../../kernel_compat_cuda.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_film.h"
-#include "../../kernel_path.h"
-#include "../../kernel_path_branched.h"
-#include "../../kernel_bake.h"
-
-/* device data taken from CUDA occupancy calculator */
-
 #ifdef __CUDA_ARCH__
 
-/* 2.0 and 2.1 */
-#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 32
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
-
-/* 3.0 and 3.5 */
-#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 63
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.0, 5.2, 5.3, 6.0, 6.1 */
-#elif __CUDA_ARCH__ >= 500
-#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-#  define CUDA_BLOCK_MAX_THREADS 1024
-#  define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-#  define CUDA_THREADS_BLOCK_WIDTH 16
-#  define CUDA_KERNEL_MAX_REGISTERS 48
-#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* unknown architecture */
-#else
-#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
-	__launch_bounds__( \
-		threads_block_width*threads_block_width, \
-		CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
-		)
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-#  error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
-#  error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-#  error "Maximum number of registers per thread exceeded"
-#endif
+#include "kernel/kernel_compat_cuda.h"
+#include "kernel_config.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_path.h"
+#include "kernel/kernel_path_branched.h"
+#include "kernel/kernel_bake.h"
 
 /* kernels */
-
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
@@ -130,8 +36,10 @@ kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int s
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+	if(x < sx + sw && y < sy + sh) {
+		KernelGlobals kg;
+		kernel_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+	}
 }
 
 #ifdef __BRANCHED_PATH__
@@ -142,8 +50,10 @@ kernel_cuda_branched_path_trace(float *buffer, uint *rng_state, int sample, int
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
-		kernel_branched_path_trace(NULL, buffer, rng_state, sample, x, y, offset, stride);
+	if(x < sx + sw && y < sy + sh) {
+		KernelGlobals kg;
+		kernel_branched_path_trace(&kg, buffer, rng_state, sample, x, y, offset, stride);
+	}
 }
 #endif
 
@@ -154,8 +64,9 @@ kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
+	if(x < sx + sw && y < sy + sh) {
 		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+	}
 }
 
 extern "C" __global__ void
@@ -165,8 +76,9 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
 
-	if(x < sx + sw && y < sy + sh)
+	if(x < sx + sw && y < sy + sh) {
 		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+	}
 }
 
 extern "C" __global__ void
@@ -183,7 +95,8 @@ kernel_cuda_shader(uint4 *input,
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	if(x < sx + sw) {
-		kernel_shader_evaluate(NULL,
+		KernelGlobals kg;
+		kernel_shader_evaluate(&kg,
 		                       input,
 		                       output,
 		                       output_luma,
@@ -200,8 +113,10 @@ kernel_cuda_bake(uint4 *input, float4 *output, int type, int filter, int sx, int
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
-	if(x < sx + sw)
-		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+	if(x < sx + sw) {
+		KernelGlobals kg;
+		kernel_bake_evaluate(&kg, input, output, (ShaderEvalType)type, filter, x, offset, sample);
+	}
 }
 #endif
 
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
new file mode 100644
index 00000000000..9fa39dc9ebb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* device data taken from CUDA occupancy calculator */
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 32
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 63
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0, 5.2, 5.3, 6.0, 6.1 and any newer architecture (matched by >= 500) */
+#elif __CUDA_ARCH__ >= 500
+#  define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+#  define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+#  define CUDA_BLOCK_MAX_THREADS 1024
+#  define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+#  define CUDA_THREADS_BLOCK_WIDTH 16
+#  define CUDA_KERNEL_MAX_REGISTERS 48
+#  define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* Build a __launch_bounds__() qualifier: a square block of threads_block_width^2 threads, and the minimum
+ * blocks per multiprocessor that fit given thread_num_registers registers per thread. Arguments are parenthesized so expressions expand safely. */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+	__launch_bounds__( \
+		(threads_block_width)*(threads_block_width), \
+		CUDA_MULTIPRESSOR_MAX_REGISTERS/((threads_block_width)*(threads_block_width)*(thread_num_registers)) \
+		)
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+#  error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+#  error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
new file mode 100644
index 00000000000..a679eff8409
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA split kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#define __SPLIT_KERNEL__
+
+#include "kernel/kernel_compat_cuda.h"
+#include "kernel_config.h"
+
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_data_init.h"
+#include "kernel/split/kernel_path_init.h"
+#include "kernel/split/kernel_scene_intersect.h"
+#include "kernel/split/kernel_lamp_emission.h"
+#include "kernel/split/kernel_do_volume.h"
+#include "kernel/split/kernel_queue_enqueue.h"
+#include "kernel/split/kernel_indirect_background.h"
+#include "kernel/split/kernel_shader_eval.h"
+#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "kernel/split/kernel_subsurface_scatter.h"
+#include "kernel/split/kernel_direct_lighting.h"
+#include "kernel/split/kernel_shadow_blocked_ao.h"
+#include "kernel/split/kernel_shadow_blocked_dl.h"
+#include "kernel/split/kernel_next_iteration_setup.h"
+#include "kernel/split/kernel_indirect_subsurface.h"
+#include "kernel/split/kernel_buffer_update.h"
+
+#include "kernel/kernel_film.h"
+
+/* kernels */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
+{
+	*size = split_data_buffer_size(NULL, num_threads);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace_data_init(
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
+        ccl_global uint *rng_state,
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
+        ccl_global int *Queue_index,
+        int queuesize,
+        ccl_global char *use_queues_flag,
+        ccl_global unsigned int *work_pool_wgs,
+        unsigned int num_samples,
+        ccl_global float *buffer)
+{
+	kernel_data_init(NULL,
+	                 NULL,
+	                 split_data_buffer,
+	                 num_elements,
+	                 ray_state,
+	                 rng_state,
+	                 start_sample,
+	                 end_sample,
+	                 sx, sy, sw, sh, offset, stride,
+	                 Queue_index,
+	                 queuesize,
+	                 use_queues_flag,
+	                 work_pool_wgs,
+	                 num_samples,
+	                 buffer);
+}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+	extern "C" __global__ void \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+	kernel_cuda_##name() \
+	{ \
+		kernel_##name(NULL); \
+	}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
+	extern "C" __global__ void \
+	CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+	kernel_cuda_##name() \
+	{ \
+		ccl_local type locals; \
+		kernel_##name(NULL, &locals); \
+	}
+
+DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_eval, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(subsurface_scatter, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
+DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
+DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+	int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+	if(x < sx + sw && y < sy + sh)
+		kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+#endif
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index a68f97857b6..078acc1631e 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -16,34 +16,34 @@
 
 /* OpenCL kernel entry points - unfinished */
 
-#include "../../kernel_compat_opencl.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_image_opencl.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_image_opencl.h"
 
-#include "../../kernel_film.h"
+#include "kernel/kernel_film.h"
 
 #if defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__)
-#  include "../../kernel_path.h"
-#  include "../../kernel_path_branched.h"
+#  include "kernel/kernel_path.h"
+#  include "kernel/kernel_path_branched.h"
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
 /* Include only actually used headers for the case
  * when path tracing kernels are not needed.
  */
-#  include "../../kernel_random.h"
-#  include "../../kernel_differential.h"
-#  include "../../kernel_montecarlo.h"
-#  include "../../kernel_projection.h"
-#  include "../../geom/geom.h"
-#  include "../../bvh/bvh.h"
-
-#  include "../../kernel_accumulate.h"
-#  include "../../kernel_camera.h"
-#  include "../../kernel_shader.h"
+#  include "kernel/kernel_random.h"
+#  include "kernel/kernel_differential.h"
+#  include "kernel/kernel_montecarlo.h"
+#  include "kernel/kernel_projection.h"
+#  include "kernel/geom/geom.h"
+#  include "kernel/bvh/bvh.h"
+
+#  include "kernel/kernel_accumulate.h"
+#  include "kernel/kernel_camera.h"
+#  include "kernel/kernel_shader.h"
 #endif  /* defined(__COMPILE_ONLY_MEGAKERNEL__) || !defined(__NO_BAKING__) */
 
-#include "../../kernel_bake.h"
+#include "kernel/kernel_bake.h"
 
 #ifdef __COMPILE_ONLY_MEGAKERNEL__
 
@@ -54,7 +54,7 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	int sample,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -65,10 +65,10 @@ __kernel void kernel_ocl_path_trace(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
@@ -84,7 +84,7 @@ __kernel void kernel_ocl_shader(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	int type, int sx, int sw, int offset, int sample)
 {
@@ -94,9 +94,9 @@ __kernel void kernel_ocl_shader(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
+	int x = sx + ccl_global_id(0);
 
 	if(x < sx + sw) {
 		kernel_shader_evaluate(kg,
@@ -116,7 +116,7 @@ __kernel void kernel_ocl_bake(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	int type, int filter, int sx, int sw, int offset, int sample)
 {
@@ -126,9 +126,9 @@ __kernel void kernel_ocl_bake(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
+	int x = sx + ccl_global_id(0);
 
 	if(x < sx + sw) {
 #ifdef __NO_BAKING__
@@ -146,7 +146,7 @@ __kernel void kernel_ocl_convert_to_byte(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -157,10 +157,10 @@ __kernel void kernel_ocl_convert_to_byte(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
@@ -173,7 +173,7 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
 	float sample_scale,
 	int sx, int sy, int sw, int sh, int offset, int stride)
@@ -184,13 +184,29 @@ __kernel void kernel_ocl_convert_to_half_float(
 
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	int x = sx + get_global_id(0);
-	int y = sy + get_global_id(1);
+	int x = sx + ccl_global_id(0);
+	int y = sy + ccl_global_id(1);
 
 	if(x < sx + sw && y < sy + sh)
 		kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
+__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, ulong size, ulong offset)
+{
+	size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);  /* flat index over the 2D range */
+	/* size and offset are byte counts; offset/sizeof(float4) truncates to whole float4 elements. */
+	if(i < size / sizeof(float4)) {
+		buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);  /* one whole float4 per work-item */
+	}
+	else if(i == size / sizeof(float4)) {  /* the first index past the whole elements clears the tail */
+		ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)];
+		/* Zero the remaining size % sizeof(float4) bytes one at a time; i is reused as the byte counter. */
+		for(i = 0; i < size % sizeof(float4); i++) {
+			*(b++) = 0;
+		}
+	}
+}
+
 #endif  /* __COMPILE_ONLY_MEGAKERNEL__ */
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
deleted file mode 100644
index 1914d241eb1..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_background_buffer_update.cl
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_background_buffer_update.h"
-
-__kernel void kernel_ocl_path_trace_background_buffer_update(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,             /* Required for buffer Update */
-        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
-        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
-        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
-        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
-        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
-        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
-        int sw, int sh, int sx, int sy, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
-        ccl_global int *Queue_data,            /* Queues memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize,                         /* Size (capacity) of each queue */
-        int end_sample,
-        int start_sample,
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,
-        unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
-{
-	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(ray_index == 0) {
-		/* We will empty this queue in this kernel. */
-		Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
-	}
-	char enqueue_flag = 0;
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          1);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag =
-			kernel_background_buffer_update((KernelGlobals *)kg,
-			                                per_sample_output_buffers,
-			                                rng_state,
-			                                rng_coop,
-			                                throughput_coop,
-			                                PathRadiance_coop,
-			                                Ray_coop,
-			                                PathState_coop,
-			                                L_transparent_coop,
-			                                ray_state,
-			                                sw, sh, sx, sy, stride,
-			                                rng_state_offset_x,
-			                                rng_state_offset_y,
-			                                rng_state_stride,
-			                                work_array,
-			                                end_sample,
-			                                start_sample,
-#ifdef __WORK_STEALING__
-			                                work_pool_wgs,
-			                                num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-			                                debugdata_coop,
-#endif
-			                                parallel_samples,
-			                                ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	 * These rays will be made active during next SceneIntersectkernel.
-	 */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
new file mode 100644
index 00000000000..db65c91baf7
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_buffer_update.h"
+
+__kernel void kernel_ocl_path_trace_buffer_update(
+        ccl_global char *kg,
+        ccl_constant KernelData *data)
+{
+	ccl_local unsigned int local_queue_atomics;
+	kernel_buffer_update((KernelGlobals*)kg, &local_queue_atomics);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
index 18139687eab..8b85d362f8a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
@@ -14,77 +14,49 @@
  * limitations under the License.
  */
 
-#include "split/kernel_data_init.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_data_init.h"
 
 __kernel void kernel_ocl_path_trace_data_init(
-        ccl_global char *globals,
-        ccl_global char *sd_DL_shadow,
+        ccl_global char *kg,
         ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
         ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
-        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-        Intersection *Intersection_coop_shadow,
-        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
 
 #define KERNEL_TEX(type, ttype, name)                                   \
         ccl_global type *name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global int *Queue_data,                  /* Memory for queues */
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
         ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
         int queuesize,                               /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
         ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
         unsigned int num_samples,                    /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                        /* Number of samples to be processed in parallel */
+        ccl_global float *buffer)
 {
-	kernel_data_init((KernelGlobals *)globals,
-	                 (ShaderData *)sd_DL_shadow,
+	kernel_data_init((KernelGlobals*)kg,
 	                 data,
-	                 per_sample_output_buffers,
-	                 rng_state,
-	                 rng_coop,
-	                 throughput_coop,
-	                 L_transparent_coop,
-	                 PathRadiance_coop,
-	                 Ray_coop,
-	                 PathState_coop,
-	                 Intersection_coop_shadow,
+	                 split_data_buffer,
+	                 num_elements,
 	                 ray_state,
+	                 rng_state,
 
 #define KERNEL_TEX(type, ttype, name) name,
-#include "../../kernel_textures.h"
+#include "kernel/kernel_textures.h"
 
-	                 start_sample, sx, sy, sw, sh, offset, stride,
-	                 rng_state_offset_x,
-	                 rng_state_offset_y,
-	                 rng_state_stride,
-	                 Queue_data,
+	                 start_sample,
+	                 end_sample,
+	                 sx, sy, sw, sh, offset, stride,
 	                 Queue_index,
 	                 queuesize,
 	                 use_queues_flag,
-	                 work_array,
-#ifdef __WORK_STEALING__
 	                 work_pool_wgs,
 	                 num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-	                 debugdata_coop,
-#endif
-	                 parallel_samples);
+	                 buffer);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
index c6a2c8d050c..eb34f750881 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
@@ -14,74 +14,14 @@
  * limitations under the License.
  */
 
-#include "split/kernel_direct_lighting.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_direct_lighting.h"
 
 __kernel void kernel_ocl_path_trace_direct_lighting(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                    /* Required for direct lighting */
-        ccl_global uint *rng_coop,              /* Required for direct lighting */
-        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
-        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
-        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
-        ccl_global char *ray_state,             /* Denotes the state of each ray */
-        ccl_global int *Queue_data,             /* Queue memory */
-        ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
-        int queuesize)                          /* Size (capacity) of each queue */
+        ccl_constant KernelData *data)
 {
 	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	char enqueue_flag = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag = kernel_direct_lighting((KernelGlobals *)kg,
-		                                      (ShaderData *)sd,
-		                                      rng_coop,
-		                                      PathState_coop,
-		                                      ISLamp_coop,
-		                                      LightRay_coop,
-		                                      BSDFEval_coop,
-		                                      ray_state,
-		                                      ray_index);
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-#ifdef __EMISSION__
-	/* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-#endif
+	kernel_direct_lighting((KernelGlobals*)kg, &local_queue_atomics);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
new file mode 100644
index 00000000000..83ef5f5f3f2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_do_volume.h"
+
+__kernel void kernel_ocl_path_trace_do_volume(
+        ccl_global char *kg,
+        ccl_constant KernelData *data)
+{
+	kernel_do_volume((KernelGlobals*)kg);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
index e063614da1a..d071b39aa6f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
@@ -14,110 +14,16 @@
  * limitations under the License.
  */
 
-#include "split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
 
 __kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                   /* Required throughout the kernel except probabilistic path termination and AO */
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
-        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
-        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
-        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
-        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
-        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
-        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
-        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
-        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
-        int sw, int sh, int sx, int sy, int stride,
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
-        ccl_global int *Queue_data,            /* Queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize,                         /* Size (capacity) of each queue */
-#ifdef __WORK_STEALING__
-        unsigned int start_sample,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
+        ccl_constant KernelData *data)
 {
-	ccl_local unsigned int local_queue_atomics_bg;
-	ccl_local unsigned int local_queue_atomics_ao;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics_bg = 0;
-		local_queue_atomics_ao = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	char enqueue_flag = 0;
-	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif  /* __COMPUTE_DEVICE_GPU__ */
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		kernel_holdout_emission_blurring_pathtermination_ao(
-		        (KernelGlobals *)kg,
-		        (ShaderData *)sd,
-		        per_sample_output_buffers,
-		        rng_coop,
-		        throughput_coop,
-		        L_transparent_coop,
-		        PathRadiance_coop,
-		        PathState_coop,
-		        Intersection_coop,
-		        AOAlpha_coop,
-		        AOBSDF_coop,
-		        AOLightRay_coop,
-		        sw, sh, sx, sy, stride,
-		        ray_state,
-		        work_array,
-#ifdef __WORK_STEALING__
-		        start_sample,
-#endif
-		        parallel_samples,
-		        ray_index,
-		        &enqueue_flag,
-		        &enqueue_flag_AO_SHADOW_RAY_CAST);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_UPDATE_BUFFER rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics_bg,
-	                        Queue_data,
-	                        Queue_index);
-
-#ifdef __AO__
-	/* Enqueue to-shadow-ray-cast rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-	                        enqueue_flag_AO_SHADOW_RAY_CAST,
-	                        queuesize,
-	                        &local_queue_atomics_ao,
-	                        Queue_data,
-	                        Queue_index);
-#endif
+	ccl_local BackgroundAOLocals locals;
+	kernel_holdout_emission_blurring_pathtermination_ao(
+	        (KernelGlobals*)kg,
+	        &locals);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
new file mode 100644
index 00000000000..8c213ff5cb2
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_indirect_background.h"
+
+__kernel void kernel_ocl_path_trace_indirect_background(
+        ccl_global char *kg,
+        ccl_constant KernelData *data)
+{
+	kernel_indirect_background((KernelGlobals*)kg);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
new file mode 100644
index 00000000000..998ebc4c0c3
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_indirect_subsurface.h"
+
+__kernel void kernel_ocl_path_trace_indirect_subsurface(
+        ccl_global char *kg,
+        ccl_constant KernelData *data)
+{
+	kernel_indirect_subsurface((KernelGlobals*)kg);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
index 267bddc2ffc..822d2287715 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
@@ -14,67 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_lamp_emission.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_lamp_emission.h"
 
 __kernel void kernel_ocl_path_trace_lamp_emission(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
-        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
-        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
-        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
-        Intersection *Intersection_coop,       /* Required for lamp emission */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global int *Queue_data,            /* Memory for queues */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
-        int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
-                                                * queues to fetch ray index
-                                                */
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
+        ccl_constant KernelData *data)
 {
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	/* We will empty this queue in this kernel. */
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-	}
-	/* Fetch use_queues_flag. */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index,
-		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                          Queue_data,
-		                          queuesize,
-		                          1);
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else {
-		if(x < (sw * parallel_samples) && y < sh) {
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	kernel_lamp_emission((KernelGlobals *)kg,
-	                     throughput_coop,
-	                     PathRadiance_coop,
-	                     Ray_coop,
-	                     PathState_coop,
-	                     Intersection_coop,
-	                     ray_state,
-	                     sw, sh,
-	                     use_queues_flag,
-	                     ray_index);
+	kernel_lamp_emission((KernelGlobals*)kg);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
index 6d49b6294a8..6d207253a40 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
@@ -14,101 +14,14 @@
  * limitations under the License.
  */
 
-#include "split/kernel_next_iteration_setup.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_next_iteration_setup.h"
 
 __kernel void kernel_ocl_path_trace_next_iteration_setup(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                  /* Required for setting up ray for next iteration */
-        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
-        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
-        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
-        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
-        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
-        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
-        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
-        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
-        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
-        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
-        ccl_global char *ray_state,           /* Denotes the state of each ray */
-        ccl_global int *Queue_data,           /* Queue memory */
-        ccl_global int *Queue_index,          /* Tracks the number of elements in each queue */
-        int queuesize,                        /* Size (capacity) of each queue */
-        ccl_global char *use_queues_flag)     /* flag to decide if scene_intersect kernel should
-                                               * use queues to fetch ray index */
+        ccl_constant KernelData *data)
 {
 	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	if(get_global_id(0) == 0 && get_global_id(1) == 0) {
-		/* If we are here, then it means that scene-intersect kernel
-		* has already been executed atleast once. From the next time,
-		* scene-intersect kernel may operate on queues to fetch ray index
-		*/
-		use_queues_flag[0] = 1;
-
-		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
-		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
-		 * previous kernel.
-		 */
-		Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
-		Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
-	}
-
-	char enqueue_flag = 0;
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-#ifdef __COMPUTE_DEVICE_GPU__
-	/* If we are executing on a GPU device, we exit all threads that are not
-	 * required.
-	 *
-	 * If we are executing on a CPU device, then we need to keep all threads
-	 * active since we have barrier() calls later in the kernel. CPU devices,
-	 * expect all threads to execute barrier statement.
-	 */
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-#endif
-
-#ifndef __COMPUTE_DEVICE_GPU__
-	if(ray_index != QUEUE_EMPTY_SLOT) {
-#endif
-		enqueue_flag = kernel_next_iteration_setup((KernelGlobals *)kg,
-		                                           (ShaderData *)sd,
-		                                           rng_coop,
-		                                           throughput_coop,
-		                                           PathRadiance_coop,
-		                                           Ray_coop,
-		                                           PathState_coop,
-		                                           LightRay_dl_coop,
-		                                           ISLamp_coop,
-		                                           BSDFEval_coop,
-		                                           LightRay_ao_coop,
-		                                           AOBSDF_coop,
-		                                           AOAlpha_coop,
-		                                           ray_state,
-		                                           use_queues_flag,
-		                                           ray_index);
-#ifndef __COMPUTE_DEVICE_GPU__
-	}
-#endif
-
-	/* Enqueue RAY_UPDATE_BUFFER rays. */
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
+	kernel_next_iteration_setup((KernelGlobals*)kg, &local_queue_atomics);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
new file mode 100644
index 00000000000..bd9aa9538c8
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_path_init.h"
+
+__kernel void kernel_ocl_path_trace_path_init(
+        ccl_global char *kg,
+        ccl_constant KernelData *data)
+{
+	kernel_path_init((KernelGlobals*)kg);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
index 3156dc255fb..9be154e3d75 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
@@ -14,93 +14,14 @@
  * limitations under the License.
  */
 
-#include "../../kernel_compat_opencl.h"
-#include "../../kernel_math.h"
-#include "../../kernel_types.h"
-#include "../../kernel_globals.h"
-#include "../../kernel_queues.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_queue_enqueue.h"
 
-/*
- * The kernel "kernel_queue_enqueue" enqueues rays of
- * different ray state into their appropriate Queues;
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel
- * are enqueued in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output of the kernel is as follows,
- *
- * ray_state -------------------------------------------|--- kernel_queue_enqueue --|--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                           |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS & QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                           |
- * queuesize -------------------------------------------|                           |
- *
- * Note on Queues :
- * State of queues during the first time this kernel is called :
- * At entry,
- * Both QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_HIT_BACKGROUND rays.
- *
- * State of queue during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
 __kernel void kernel_ocl_path_trace_queue_enqueue(
-        ccl_global int *Queue_data,   /* Queue memory */
-        ccl_global int *Queue_index,  /* Tracks the number of elements in each queue */
-        ccl_global char *ray_state,   /* Denotes the state of each ray */
-        int queuesize)                /* Size (capacity) of each queue */
+        ccl_global char *kg,
+        ccl_constant KernelData *data)
 {
-	/* We have only 2 cases (Hit/Not-Hit) */
-	ccl_local unsigned int local_queue_atomics[2];
-
-	int lidx = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-
-	if(lidx < 2 ) {
-		local_queue_atomics[lidx] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int queue_number = -1;
-
-	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
-	}
-	else if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
-	}
-
-	unsigned int my_lqidx;
-	if(queue_number != -1) {
-		my_lqidx = get_local_queue_index(queue_number, local_queue_atomics);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	if(lidx == 0) {
-		local_queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
-		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                                    local_queue_atomics,
-		                                    Queue_index);
-		local_queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
-		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-		                                    local_queue_atomics,
-		                                    Queue_index);
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	unsigned int my_gqidx;
-	if(queue_number != -1) {
-		my_gqidx = get_global_queue_index(queue_number,
-		                                  queuesize,
-		                                  my_lqidx,
-		                                  local_queue_atomics);
-		Queue_data[my_gqidx] = ray_index;
-	}
+	ccl_local QueueEnqueueLocals locals;
+	kernel_queue_enqueue((KernelGlobals*)kg, &locals);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
index 7f3f433c7a6..eb4fb4d153a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
@@ -14,67 +14,13 @@
  * limitations under the License.
  */
 
-#include "split/kernel_scene_intersect.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_scene_intersect.h"
 
 __kernel void kernel_ocl_path_trace_scene_intersect(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global uint *rng_coop,
-        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
-        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
-        Intersection *Intersection_coop,       /* Required for scene_intersect */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global int *Queue_data,            /* Memory for queues */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in queues */
-        int queuesize,                         /* Size (capacity) of queues */
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
-                                                * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                  /* Number of samples to be processed in parallel */
+        ccl_constant KernelData *data)
 {
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	/* Fetch use_queues_flag */
-	ccl_local char local_use_queues_flag;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_use_queues_flag = use_queues_flag[0];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index;
-	if(local_use_queues_flag) {
-		int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-		ray_index = get_ray_index(thread_index,
-		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-		                          Queue_data,
-		                          queuesize,
-		                          0);
-
-		if(ray_index == QUEUE_EMPTY_SLOT) {
-			return;
-		}
-	} else {
-		if(x < (sw * parallel_samples) && y < sh) {
-			ray_index = x + y * (sw * parallel_samples);
-		} else {
-			return;
-		}
-	}
-
-	kernel_scene_intersect((KernelGlobals *)kg,
-	                       rng_coop,
-	                       Ray_coop,
-	                       PathState_coop,
-	                       Intersection_coop,
-	                       ray_state,
-	                       sw, sh,
-	                       use_queues_flag,
-#ifdef __KERNEL_DEBUG__
-	                       debugdata_coop,
-#endif
-	                       ray_index);
+	kernel_scene_intersect((KernelGlobals*)kg);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
index c37856c8f30..6baee460986 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
@@ -14,55 +14,14 @@
  * limitations under the License.
  */
 
-#include "split/kernel_shader_eval.h"
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shader_eval.h"
 
 __kernel void kernel_ocl_path_trace_shader_eval(
         ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global char *sd,                   /* Output ShaderData structure to be filled */
-        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
-        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
-        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
-        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global int *Queue_data,            /* queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize)                         /* Size (capacity) of each queue */
+        ccl_constant KernelData *data)
 {
-	/* Enqeueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
 	ccl_local unsigned int local_queue_atomics;
-	if(get_local_id(0) == 0 && get_local_id(1) == 0) {
-		local_queue_atomics = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	ray_index = get_ray_index(ray_index,
-	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-	                          Queue_data,
-	                          queuesize,
-	                          0);
-
-	if(ray_index == QUEUE_EMPTY_SLOT) {
-		return;
-	}
-
-	char enqueue_flag = (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 : 0;
-	enqueue_ray_index_local(ray_index,
-	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-	                        enqueue_flag,
-	                        queuesize,
-	                        &local_queue_atomics,
-	                        Queue_data,
-	                        Queue_index);
-
-	/* Continue on with shader evaluation. */
-	kernel_shader_eval((KernelGlobals *)kg,
-	                   (ShaderData *)sd,
-	                   rng_coop,
-	                   Ray_coop,
-	                   PathState_coop,
-	                   Intersection_coop,
-	                   ray_state,
-	                   ray_index);
+	kernel_shader_eval((KernelGlobals*)kg, &local_queue_atomics);
 }
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
deleted file mode 100644
index edf76fba714..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked.cl
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_shadow_blocked.h"
-
-__kernel void kernel_ocl_path_trace_shadow_blocked(
-        ccl_global char *kg,
-        ccl_constant KernelData *data,
-        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
-        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
-        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
-        ccl_global char *ray_state,
-        ccl_global int *Queue_data,            /* Queue memory */
-        ccl_global int *Queue_index,           /* Tracks the number of elements in each queue */
-        int queuesize)                         /* Size (capacity) of each queue */
-{
-	int lidx = get_local_id(1) * get_local_id(0) + get_local_id(0);
-
-	ccl_local unsigned int ao_queue_length;
-	ccl_local unsigned int dl_queue_length;
-	if(lidx == 0) {
-		ao_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
-		dl_queue_length = Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-
-	/* flag determining if the current ray is to process shadow ray for AO or DL */
-	char shadow_blocked_type = -1;
-
-	int ray_index = QUEUE_EMPTY_SLOT;
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
-	if(thread_index < ao_queue_length + dl_queue_length) {
-		if(thread_index < ao_queue_length) {
-			ray_index = get_ray_index(thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_AO;
-		} else {
-			ray_index = get_ray_index(thread_index - ao_queue_length, QUEUE_SHADOW_RAY_CAST_DL_RAYS, Queue_data, queuesize, 1);
-			shadow_blocked_type = RAY_SHADOW_RAY_CAST_DL;
-		}
-	}
-
-	if(ray_index == QUEUE_EMPTY_SLOT)
-		return;
-
-	kernel_shadow_blocked((KernelGlobals *)kg,
-	                      PathState_coop,
-	                      LightRay_dl_coop,
-	                      LightRay_ao_coop,
-	                      ray_state,
-	                      shadow_blocked_type,
-	                      ray_index);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
new file mode 100644
index 00000000000..6a8ef81b32a
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shadow_blocked_ao.h"
+
+__kernel void kernel_ocl_path_trace_shadow_blocked_ao(  /* OpenCL entry point; only forwards to kernel_shadow_blocked_ao() */
+        ccl_global char *kg,             /* Untyped buffer, reinterpreted as KernelGlobals* for the call below */
+        ccl_constant KernelData *data)   /* Not referenced anywhere in this body */
+{
+	kernel_shadow_blocked_ao((KernelGlobals*)kg);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
new file mode 100644
index 00000000000..b255cc5ef8b
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_shadow_blocked_dl.h"
+
+__kernel void kernel_ocl_path_trace_shadow_blocked_dl(  /* OpenCL entry point; only forwards to kernel_shadow_blocked_dl() */
+        ccl_global char *kg,             /* Untyped buffer, reinterpreted as KernelGlobals* for the call below */
+        ccl_constant KernelData *data)   /* Not referenced anywhere in this body */
+{
+	kernel_shadow_blocked_dl((KernelGlobals*)kg);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split.cl b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
new file mode 100644
index 00000000000..732cda30115
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_split.cl
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
+#include "kernel/kernels/opencl/kernel_data_init.cl"
+#include "kernel/kernels/opencl/kernel_path_init.cl"
+
+#include "kernel/kernels/opencl/kernel_scene_intersect.cl"
+#include "kernel/kernels/opencl/kernel_lamp_emission.cl"
+#include "kernel/kernels/opencl/kernel_do_volume.cl"
+#include "kernel/kernels/opencl/kernel_indirect_background.cl"
+#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
+#include "kernel/kernels/opencl/kernel_shader_eval.cl"
+#include "kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl"
+#include "kernel/kernels/opencl/kernel_subsurface_scatter.cl"
+#include "kernel/kernels/opencl/kernel_direct_lighting.cl"
+#include "kernel/kernels/opencl/kernel_shadow_blocked_ao.cl"
+#include "kernel/kernels/opencl/kernel_shadow_blocked_dl.cl"
+#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
+#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
+#include "kernel/kernels/opencl/kernel_buffer_update.cl"
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
new file mode 100644
index 00000000000..c10ecc426c6
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+
+__kernel void kernel_ocl_path_trace_state_buffer_size(  /* OpenCL entry point; writes the split_data_buffer_size() result to *size */
+        ccl_global char *kg,            /* Untyped buffer, reinterpreted as KernelGlobals* below */
+        ccl_constant KernelData *data,  /* Stored into kg->data before the size query */
+        uint num_threads,               /* Forwarded unchanged to split_data_buffer_size() */
+        ccl_global uint64_t *size)      /* Output: receives the value returned by split_data_buffer_size() */
+{
+	((KernelGlobals*)kg)->data = data;  /* NOTE(review): presumably split_data_buffer_size() reads kg->data - confirm */
+	*size = split_data_buffer_size((KernelGlobals*)kg, num_threads);
+}
+
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
new file mode 100644
index 00000000000..7a1838e485f
--- /dev/null
+++ b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel/kernel_compat_opencl.h"
+#include "kernel/split/kernel_split_common.h"
+#include "kernel/split/kernel_subsurface_scatter.h"
+
+__kernel void kernel_ocl_path_trace_subsurface_scatter(  /* OpenCL entry point; only forwards to kernel_subsurface_scatter() */
+        ccl_global char *kg,             /* Untyped buffer, reinterpreted as KernelGlobals* for the call below */
+        ccl_constant KernelData *data)   /* Not referenced anywhere in this body */
+{
+	ccl_local unsigned int local_queue_atomics;  /* Not initialized here; passed by address to the callee */
+	kernel_subsurface_scatter((KernelGlobals*)kg, &local_queue_atomics);
+}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
deleted file mode 100644
index 88a1ed830af..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "split/kernel_sum_all_radiance.h"
-
-__kernel void kernel_ocl_path_trace_sum_all_radiance(
-        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
-        ccl_global float *buffer,                    /* Output buffer of RenderTile */
-        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
-        int parallel_samples, int sw, int sh, int stride,
-        int buffer_offset_x,
-        int buffer_offset_y,
-        int buffer_stride,
-        int start_sample)
-{
-	kernel_sum_all_radiance(data,
-	                        buffer,
-	                        per_sample_output_buffer,
-	                        parallel_samples,
-	                        sw, sh, stride,
-	                        buffer_offset_x,
-	                        buffer_offset_y,
-	                        buffer_stride,
-	                        start_sample);
-}
diff --git a/intern/cycles/kernel/osl/CMakeLists.txt b/intern/cycles/kernel/osl/CMakeLists.txt
index 98de40e5a8a..d2eb89e0e0a 100644
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -1,12 +1,6 @@
 
 set(INC
-	.
-	..
-	../svm
-	../../graph
-	../../render
-	../../util
-	../../device
+	../..
 )
 
 set(INC_SYS
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index d835f9be45c..2e73e7a601e 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -34,10 +34,10 @@
 
 #include <OSL/genclosure.h>
 
-#include "osl_closures.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_compat_cpu.h"
-#include "closure/alloc.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/closure/alloc.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index bc26f42b559..ea18f2c8c86 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,13 +34,13 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "kernel_montecarlo.h"
-#include "closure/alloc.h"
-#include "closure/bsdf_diffuse_ramp.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_diffuse_ramp.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index 14c7644936e..a26671eb09e 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,12 +34,12 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "closure/alloc.h"
-#include "closure/bsdf_phong_ramp.h"
+#include "kernel/kernel_types.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_phong_ramp.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index 3f13e08b302..8843a196dad 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -34,12 +34,12 @@
 
 #include <OSL/genclosure.h>
 
-#include "osl_closures.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_compat_cpu.h"
-#include "kernel_types.h"
-#include "closure/alloc.h"
-#include "closure/emissive.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_types.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/emissive.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index 44daefee249..188c3960a5f 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -32,17 +32,17 @@
 
 #include <OSL/genclosure.h>
 
-#include "kernel_compat_cpu.h"
-#include "osl_closures.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/osl/osl_closures.h"
 
-#include "kernel_types.h"
-#include "kernel_montecarlo.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_montecarlo.h"
 
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf_diffuse.h"
-#include "closure/bsdf_principled_diffuse.h"
-#include "closure/bssrdf.h"
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bssrdf.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -81,7 +81,7 @@ public:
 				bssrdf->sharpness = sharpness;
 				bssrdf->N = params.N;
 				bssrdf->roughness = params.roughness;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+				sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 			}
 
 			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
@@ -93,7 +93,7 @@ public:
 				bssrdf->sharpness = sharpness;
 				bssrdf->N = params.N;
 				bssrdf->roughness = params.roughness;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+				sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 			}
 
 			bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
@@ -105,7 +105,7 @@ public:
 				bssrdf->sharpness = sharpness;
 				bssrdf->N = params.N;
 				bssrdf->roughness = params.roughness;
-				ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+				sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 			}
 		}
 	}
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 5570a22692e..5b66793a05d 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -33,35 +33,36 @@
 #include <OSL/genclosure.h>
 #include <OSL/oslclosure.h>
 
-#include "osl_closures.h"
-#include "osl_shader.h"
-
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_param.h"
-
-#include "kernel_types.h"
-#include "kernel_compat_cpu.h"
-#include "kernel_globals.h"
-#include "kernel_montecarlo.h"
-#include "kernel_random.h"
-
-#include "closure/alloc.h"
-#include "closure/bsdf_util.h"
-#include "closure/bsdf_ashikhmin_velvet.h"
-#include "closure/bsdf_diffuse.h"
-#include "closure/bsdf_microfacet.h"
-#include "closure/bsdf_microfacet_multi.h"
-#include "closure/bsdf_oren_nayar.h"
-#include "closure/bsdf_reflection.h"
-#include "closure/bsdf_refraction.h"
-#include "closure/bsdf_transparent.h"
-#include "closure/bsdf_ashikhmin_shirley.h"
-#include "closure/bsdf_toon.h"
-#include "closure/bsdf_hair.h"
-#include "closure/bsdf_principled_diffuse.h"
-#include "closure/bsdf_principled_sheen.h"
-#include "closure/volume.h"
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_shader.h"
+
+#include "util/util_debug.h"
+#include "util/util_math.h"
+#include "util/util_param.h"
+
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_random.h"
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_util.h"
+#include "kernel/closure/bsdf_ashikhmin_velvet.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_microfacet.h"
+#include "kernel/closure/bsdf_microfacet_multi.h"
+#include "kernel/closure/bsdf_oren_nayar.h"
+#include "kernel/closure/bsdf_reflection.h"
+#include "kernel/closure/bsdf_refraction.h"
+#include "kernel/closure/bsdf_transparent.h"
+#include "kernel/closure/bsdf_ashikhmin_shirley.h"
+#include "kernel/closure/bsdf_toon.h"
+#include "kernel/closure/bsdf_hair.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bsdf_principled_sheen.h"
+#include "kernel/closure/volume.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/osl/osl_closures.h b/intern/cycles/kernel/osl/osl_closures.h
index c9740f81c8a..ff5fd9cc905 100644
--- a/intern/cycles/kernel/osl/osl_closures.h
+++ b/intern/cycles/kernel/osl/osl_closures.h
@@ -33,8 +33,8 @@
 #ifndef __OSL_CLOSURES_H__
 #define __OSL_CLOSURES_H__
 
-#include "util_types.h"
-#include "kernel_types.h"
+#include "util/util_types.h"
+#include "kernel/kernel_types.h"
 
 #include <OSL/oslclosure.h>
 #include <OSL/oslexec.h>
diff --git a/intern/cycles/kernel/osl/osl_globals.h b/intern/cycles/kernel/osl/osl_globals.h
index 65cb7ecc6b4..02c083a83f8 100644
--- a/intern/cycles/kernel/osl/osl_globals.h
+++ b/intern/cycles/kernel/osl/osl_globals.h
@@ -21,10 +21,10 @@
 
 #include <OSL/oslexec.h>
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 #ifndef WIN32
 using std::isfinite;
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 58bbdc33920..b767c60c617 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -25,33 +25,34 @@
 
 #include <string.h>
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-
-#include "osl_closures.h"
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_string.h"
-
-#include "kernel_compat_cpu.h"
-#include "kernel_globals.h"
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_differential.h"
-#include "kernel_montecarlo.h"
-#include "kernel_camera.h"
-#include "kernels/cpu/kernel_cpu_image.h"
-#include "geom/geom.h"
-#include "bvh/bvh.h"
-
-#include "kernel_projection.h"
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_string.h"
+
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernels/cpu/kernel_cpu_image.h"
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
+
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
 
 #ifdef WITH_PTEX
 #  include <Ptexture.h>
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 0d762bbdb38..13b19d86eca 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -16,21 +16,22 @@
 
 #include <OSL/oslexec.h>
 
-#include "kernel_compat_cpu.h"
-#include "kernel_montecarlo.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
+#include "kernel/kernel_compat_cpu.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_types.h"
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
 
-#include "geom/geom_object.h"
+#include "kernel/geom/geom_object.h"
 
-#include "osl_closures.h"
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
+#include "kernel/osl/osl_closures.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
 
-#include "util_foreach.h"
+#include "util/util_foreach.h"
 
-#include "attribute.h"
+#include "render/attribute.h"
 
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index ad06dd6929d..32121e940b4 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -29,7 +29,7 @@
  * This means no thread state must be passed along in the kernel itself.
  */
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
deleted file mode 100644
index 9bfa71c75ef..00000000000
--- a/intern/cycles/kernel/split/kernel_background_buffer_update.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_background_buffer_update kernel.
- * This is the fourth kernel in the ray tracing logic, and the third
- * of the path iteration kernels. This kernel takes care of rays that hit
- * the background (sceneintersect kernel), and for the rays of
- * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
- * the output buffer. This kernel also takes care of rays that have been determined
- * to-be-regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
- * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * The input and output are as follows,
- *
- * rng_coop ---------------------------------------------|--- kernel_background_buffer_update --|--- PathRadiance_coop
- * throughput_coop --------------------------------------|                                      |--- L_transparent_coop
- * per_sample_output_buffers ----------------------------|                                      |--- per_sample_output_buffers
- * Ray_coop ---------------------------------------------|                                      |--- ray_state
- * PathState_coop ---------------------------------------|                                      |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * L_transparent_coop -----------------------------------|                                      |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * ray_state --------------------------------------------|                                      |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                      |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                      |--- work_array
- * parallel_samples -------------------------------------|                                      |--- PathState_coop
- * end_sample -------------------------------------------|                                      |--- throughput_coop
- * kg (globals) -----------------------------------------|                                      |--- rng_coop
- * rng_state --------------------------------------------|                                      |--- Ray
- * PathRadiance_coop ------------------------------------|                                      |
- * sw ---------------------------------------------------|                                      |
- * sh ---------------------------------------------------|                                      |
- * sx ---------------------------------------------------|                                      |
- * sy ---------------------------------------------------|                                      |
- * stride -----------------------------------------------|                                      |
- * work_array -------------------------------------------|                                      |--- work_array
- * queuesize --------------------------------------------|                                      |
- * start_sample -----------------------------------------|                                      |--- work_pool_wgs
- * work_pool_wgs ----------------------------------------|                                      |
- * num_samples ------------------------------------------|                                      |
- *
- * note on sd : sd argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
- * Note on Queues :
- * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- *
- * State of queues when this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
- */
-ccl_device char kernel_background_buffer_update(
-        KernelGlobals *kg,
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,             /* Required for buffer Update */
-        ccl_global float3 *throughput_coop,    /* Required for background hit processing */
-        PathRadiance *PathRadiance_coop,       /* Required for background hit processing and buffer Update */
-        ccl_global Ray *Ray_coop,              /* Required for background hit processing */
-        ccl_global PathState *PathState_coop,  /* Required for background hit processing */
-        ccl_global float *L_transparent_coop,  /* Required for background hit processing and buffer Update */
-        ccl_global char *ray_state,            /* Stores information on the current state of a ray */
-        int sw, int sh, int sx, int sy, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global unsigned int *work_array,   /* Denotes work of each ray */
-        int end_sample,
-        int start_sample,
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,
-        unsigned int num_samples,
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples,                  /* Number of samples to be processed in parallel */
-        int ray_index)
-{
-	char enqueue_flag = 0;
-#ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &debugdata_coop[ray_index];
-#endif
-	ccl_global PathState *state = &PathState_coop[ray_index];
-	PathRadiance *L = L = &PathRadiance_coop[ray_index];
-	ccl_global Ray *ray = &Ray_coop[ray_index];
-	ccl_global float3 *throughput = &throughput_coop[ray_index];
-	ccl_global float *L_transparent = &L_transparent_coop[ray_index];
-	ccl_global uint *rng = &rng_coop[ray_index];
-
-#ifdef __WORK_STEALING__
-	unsigned int my_work;
-	ccl_global float *initial_per_sample_output_buffers;
-	ccl_global uint *initial_rng;
-#endif
-	unsigned int sample;
-	unsigned int tile_x;
-	unsigned int tile_y;
-	unsigned int pixel_x;
-	unsigned int pixel_y;
-	unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-	my_work = work_array[ray_index];
-	sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-	get_pixel_tile_position(&pixel_x, &pixel_y,
-	                        &tile_x, &tile_y,
-	                        my_work,
-	                        sw, sh, sx, sy,
-	                        parallel_samples,
-	                        ray_index);
-	my_sample_tile = 0;
-	initial_per_sample_output_buffers = per_sample_output_buffers;
-	initial_rng = rng_state;
-#else  /* __WORK_STEALING__ */
-	sample = work_array[ray_index];
-	int tile_index = ray_index / parallel_samples;
-	/* buffer and rng_state's stride is "stride". Find x and y using ray_index */
-	tile_x = tile_index % sw;
-	tile_y = tile_index / sw;
-	my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif  /* __WORK_STEALING__ */
-
-	rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-	per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-
-	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-		/* eval background shader if nothing hit */
-		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
-			*L_transparent = (*L_transparent) + average((*throughput));
-#ifdef __PASSES__
-			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-		}
-
-		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
-#ifdef __BACKGROUND__
-			/* sample background shader */
-			float3 L_background = indirect_background(kg, kg->sd_input, state, ray);
-			path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
-#endif
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-		}
-	}
-
-	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
-		float3 L_sum = path_radiance_clamp_and_sum(kg, L);
-		kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
-#ifdef __KERNEL_DEBUG__
-		kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
-#endif
-		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
-
-		/* accumulate result in output buffer */
-		kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-		path_rng_end(kg, rng_state, *rng);
-
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-	}
-
-	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-		/* We have completed current work; So get next work */
-		int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-		if(!valid_work) {
-			/* If work is invalid, this means no more work is available and the thread may exit */
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-		}
-#else  /* __WORK_STEALING__ */
-		if((sample + parallel_samples) >= end_sample) {
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
-		}
-#endif  /* __WORK_STEALING__ */
-
-		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
-#ifdef __WORK_STEALING__
-			work_array[ray_index] = my_work;
-			/* Get the sample associated with the current work */
-			sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-			/* Get pixel and tile position associated with current work */
-			get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
-			my_sample_tile = 0;
-
-			/* Remap rng_state according to the current work */
-			rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
-			/* Remap per_sample_output_buffers according to the current work */
-			per_sample_output_buffers = initial_per_sample_output_buffers
-				+ (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
-#else  /* __WORK_STEALING__ */
-			work_array[ray_index] = sample + parallel_samples;
-			sample = work_array[ray_index];
-
-			/* Get ray position from ray index */
-			pixel_x = sx + ((ray_index / parallel_samples) % sw);
-			pixel_y = sy + ((ray_index / parallel_samples) / sw);
-#endif  /* __WORK_STEALING__ */
-
-			/* Initialize random numbers and ray. */
-			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
-
-			if(ray->t != 0.0f) {
-				/* Initialize throughput, L_transparent, Ray, PathState;
-				 * These rays proceed with path-iteration.
-				 */
-				*throughput = make_float3(1.0f, 1.0f, 1.0f);
-				*L_transparent = 0.0f;
-				path_radiance_init(L, kernel_data.film.use_light_pass);
-				path_state_init(kg, kg->sd_input, state, rng, sample, ray);
-#ifdef __KERNEL_DEBUG__
-				debug_data_init(debug_data);
-#endif
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
-				enqueue_flag = 1;
-			}
-			else {
-				/* These rays do not participate in path-iteration. */
-				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-				/* Accumulate result in output buffer. */
-				kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
-				path_rng_end(kg, rng_state, *rng);
-
-				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-			}
-		}
-	}
-	return enqueue_flag;
-}
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
new file mode 100644
index 00000000000..859c221d976
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel takes care of rays that hit the background (sceneintersect
+ * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's
+ * accumulated radiance in the output buffer. This kernel also takes care of
+ * rays that have been determined to-be-regenerated.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
+ *
+ * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel.
+ * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put
+ * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * State of queues when this kernel is called:
+ * At entry,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
+ * At exit,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
+ */
+ccl_device void kernel_buffer_update(KernelGlobals *kg,
+                                     ccl_local_param unsigned int *local_queue_atomics)
+{
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;  /* Reset once per work-group, before the barrier below. */
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(ray_index == 0) {
+		/* We will empty this queue in this kernel. */
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	char enqueue_flag = 0;  /* Set to 1 only when this thread regenerates a ray. */
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+	ccl_global uint *rng_state = kernel_split_params.rng_state;
+	int stride = kernel_split_params.stride;
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+#ifdef __KERNEL_DEBUG__
+	DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
+#endif
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
+	RNG rng = kernel_split_state.rng[ray_index];  /* Local copy; stored back at the end of the kernel. */
+	ccl_global float *buffer = kernel_split_params.buffer;
+
+	unsigned int work_index;
+	ccl_global uint *initial_rng;
+
+	unsigned int sample;
+	unsigned int tile_x;
+	unsigned int tile_y;
+	unsigned int pixel_x;
+	unsigned int pixel_y;
+
+	work_index = kernel_split_state.work_array[ray_index];
+	sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
+	                        &tile_x, &tile_y,
+	                        work_index,
+	                        ray_index);
+	initial_rng = rng_state;  /* Keep the un-offset base so it can be remapped for the next work item. */
+
+	rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
+	buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		float3 L_sum;
+#ifdef __SHADOW_TRICKS__
+		if(state->flag & PATH_RAY_SHADOW_CATCHER) {
+			L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
+		}
+		else
+#endif  /* __SHADOW_TRICKS__ */
+		{
+			L_sum = path_radiance_clamp_and_sum(kg, L);
+		}
+		kernel_write_light_passes(kg, buffer, L, sample);
+#ifdef __KERNEL_DEBUG__
+		kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
+#endif
+		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
+
+		/* accumulate result in output buffer */
+		kernel_write_pass_float4(buffer, sample, L_rad);
+		path_rng_end(kg, rng_state, rng);
+
+		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+	}
+
+	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+		/* We have completed current work; So get next work */
+		int valid_work = get_next_work(kg, &work_index, ray_index);
+		if(!valid_work) {
+			/* If work is invalid, this means no more work is available and the thread may exit */
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+			kernel_split_state.work_array[ray_index] = work_index;
+			/* Get the sample associated with the current work */
+			sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+			/* Get pixel and tile position associated with current work */
+			get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
+
+			/* Remap rng_state according to the current work */
+			rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride;
+			/* Remap buffer from its base: it was already advanced to the previous pixel above, so += would add both offsets. */
+			buffer = kernel_split_params.buffer + (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+
+			/* Initialize random numbers and ray. */
+			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng, ray);
+
+			if(ray->t != 0.0f) {
+				/* Initialize throughput, L_transparent, Ray, PathState;
+				 * These rays proceed with path-iteration.
+				 */
+				*throughput = make_float3(1.0f, 1.0f, 1.0f);
+				*L_transparent = 0.0f;
+				path_radiance_init(L, kernel_data.film.use_light_pass);
+				path_state_init(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &rng, sample, ray);
+#ifdef __SUBSURFACE__
+				kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+#ifdef __KERNEL_DEBUG__
+				debug_data_init(debug_data);
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+				enqueue_flag = 1;
+			}
+			else {
+				/* These rays do not participate in path-iteration. */
+				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+				/* Accumulate result in output buffer. */
+				kernel_write_pass_float4(buffer, sample, L_rad);
+				path_rng_end(kg, rng_state, rng);
+
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+			}
+		}
+	}
+	kernel_split_state.rng[ray_index] = rng;
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	 * These rays will be made active during the next scene intersect kernel.
+	 */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 6e158d53d23..9d3d01fff75 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -14,108 +14,105 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_data_initialization kernel
- * This kernel Initializes structures needed in path-iteration kernels.
- * This is the first kernel in ray-tracing logic.
+/* This kernel initializes structures needed in path-iteration kernels.
  *
- * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
- *
- * Its input and output are as follows,
- *
- * Un-initialized rng---------------|--- kernel_data_initialization ---|--- Initialized rng
- * Un-initialized throughput -------|                                  |--- Initialized throughput
- * Un-initialized L_transparent ----|                                  |--- Initialized L_transparent
- * Un-initialized PathRadiance -----|                                  |--- Initialized PathRadiance
- * Un-initialized Ray --------------|                                  |--- Initialized Ray
- * Un-initialized PathState --------|                                  |--- Initialized PathState
- * Un-initialized QueueData --------|                                  |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
- * Un-initialized QueueIndex -------|                                  |--- Initialized QueueIndex (to 0)
- * Un-initialized use_queues_flag---|                                  |--- Initialized use_queues_flag (to false)
- * Un-initialized ray_state --------|                                  |--- Initialized ray_state
- * parallel_samples --------------- |                                  |--- Initialized per_sample_output_buffers
- * rng_state -----------------------|                                  |--- Initialized work_array
- * data ----------------------------|                                  |--- Initialized work_pool_wgs
- * start_sample --------------------|                                  |
- * sx ------------------------------|                                  |
- * sy ------------------------------|                                  |
- * sw ------------------------------|                                  |
- * sh ------------------------------|                                  |
- * stride --------------------------|                                  |
- * queuesize -----------------------|                                  |
- * num_samples ---------------------|                                  |
- *
- * Note on Queues :
+ * Note on Queues:
  * All slots in queues are initialized to queue empty slot;
  * The number of elements in the queues is initialized to 0;
  */
+
+/* Distributes work_size items across all threads: each thread iterates a contiguous
+ * chunk of work_size / global_size items, then at most one leftover item.
+ * Note: work done inside the loop may not be visible to all threads until the current kernel completes.
+ */
+#define parallel_for(kg, iter_name, work_size) \
+	for(size_t _size = (work_size), \
+	    _global_size = ccl_global_size(0) * ccl_global_size(1), \
+	    _n = _size / _global_size, \
+		_thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \
+	    iter_name = (_n > 0) ? (_thread * _n) : (_thread) \
+		; \
+		(iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \
+		; \
+		iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \
+	)
+
+#ifndef __KERNEL_CPU__
 ccl_device void kernel_data_init(
+#else
+void KERNEL_FUNCTION_FULL_NAME(data_init)(
+#endif
         KernelGlobals *kg,
-        ShaderData *sd_DL_shadow,
         ccl_constant KernelData *data,
-        ccl_global float *per_sample_output_buffers,
+        ccl_global void *split_data_buffer,
+        int num_elements,
+        ccl_global char *ray_state,
         ccl_global uint *rng_state,
-        ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
-        ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
-        ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
-        PathRadiance *PathRadiance_coop,             /* PathRadiance array to store PathRadiance values for all rays */
-        ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
-        ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
-        Intersection *Intersection_coop_shadow,
-        ccl_global char *ray_state,                  /* Stores information on current state of a ray */
 
+#ifdef __KERNEL_OPENCL__
 #define KERNEL_TEX(type, ttype, name)                                   \
         ccl_global type *name,
-#include "../kernel_textures.h"
+#include "kernel/kernel_textures.h"
+#endif
 
-        int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
-        int rng_state_offset_x,
-        int rng_state_offset_y,
-        int rng_state_stride,
-        ccl_global int *Queue_data,                  /* Memory for queues */
+        int start_sample,
+        int end_sample,
+        int sx, int sy, int sw, int sh, int offset, int stride,
         ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
         int queuesize,                               /* size (capacity) of the queue */
         ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
-        ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
-#ifdef __WORK_STEALING__
-        ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
-        unsigned int num_samples,                    /* Total number of samples per pixel */
-#endif
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int parallel_samples)                        /* Number of samples to be processed in parallel */
+        ccl_global unsigned int *work_pools,      /* Work pool for each work group */
+        unsigned int num_samples,
+        ccl_global float *buffer)
 {
+#ifdef __KERNEL_OPENCL__
 	kg->data = data;
-	kg->sd_input = sd_DL_shadow;
-	kg->isect_shadow = Intersection_coop_shadow;
+#endif
+
+	kernel_split_params.x = sx;
+	kernel_split_params.y = sy;
+	kernel_split_params.w = sw;
+	kernel_split_params.h = sh;
+
+	kernel_split_params.offset = offset;
+	kernel_split_params.stride = stride;
+
+	kernel_split_params.rng_state = rng_state;
+
+	kernel_split_params.start_sample = start_sample;
+	kernel_split_params.end_sample = end_sample;
+
+	kernel_split_params.work_pools = work_pools;
+	kernel_split_params.num_samples = num_samples;
+
+	kernel_split_params.queue_index = Queue_index;
+	kernel_split_params.queue_size = queuesize;
+	kernel_split_params.use_queues_flag = use_queues_flag;
+
+	kernel_split_params.buffer = buffer;
+
+	split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
+
+#ifdef __KERNEL_OPENCL__
 #define KERNEL_TEX(type, ttype, name) \
 	kg->name = name;
-#include "../kernel_textures.h"
-
-	int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+#include "kernel/kernel_textures.h"
+#endif
 
-#ifdef __WORK_STEALING__
-	int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
-	/* Initialize work_pool_wgs */
-	if(lid == 0) {
-		int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
-		work_pool_wgs[group_index] = 0;
-	}
-	barrier(CLK_LOCAL_MEM_FENCE);
-#endif  /* __WORK_STEALING__ */
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
 
 	/* Initialize queue data and queue index. */
 	if(thread_index < queuesize) {
 		/* Initialize active ray queue. */
-		Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		kernel_split_state.queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 		/* Initialize background and buffer update queue. */
-		Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		kernel_split_state.queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 		/* Initialize shadow ray cast of AO queue. */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 		/* Initialize shadow ray cast of direct lighting queue. */
-		Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+		kernel_split_state.queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
 	}
 
 	if(thread_index == 0) {
@@ -126,109 +123,31 @@ ccl_device void kernel_data_init(
 		/* The scene-intersect kernel should not use the queues very first time.
 		 * since the queue would be empty.
 		 */
-		use_queues_flag[0] = 0;
+		*use_queues_flag = 0;
 	}
 
-	int x = get_global_id(0);
-	int y = get_global_id(1);
+	/* Zero the tile's pixels and initialize rng_state if this is the first sample. */
+	if(start_sample == 0) {
+		parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) {
+			int pixel = i / kernel_data.film.pass_stride;
+			int pass = i % kernel_data.film.pass_stride;
 
-	if(x < (sw * parallel_samples) && y < sh) {
-		int ray_index = x + y * (sw * parallel_samples);
+			int x = sx + pixel % sw;
+			int y = sy + pixel / sw;
 
-		/* This is the first assignment to ray_state;
-		 * So we dont use ASSIGN_RAY_STATE macro.
-		 */
-		ray_state[ray_index] = RAY_ACTIVE;
-
-		unsigned int my_sample;
-		unsigned int pixel_x;
-		unsigned int pixel_y;
-		unsigned int tile_x;
-		unsigned int tile_y;
-		unsigned int my_sample_tile;
-
-#ifdef __WORK_STEALING__
-		unsigned int my_work = 0;
-		/* Get work. */
-		get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
-		/* Get the sample associated with the work. */
-		my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-
-		my_sample_tile = 0;
-
-		/* Get pixel and tile position associated with the work. */
-		get_pixel_tile_position(&pixel_x, &pixel_y,
-		                        &tile_x, &tile_y,
-		                        my_work,
-		                        sw, sh, sx, sy,
-		                        parallel_samples,
-		                        ray_index);
-		work_array[ray_index] = my_work;
-#else  /* __WORK_STEALING__ */
-		unsigned int tile_index = ray_index / parallel_samples;
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-		my_sample = my_sample_tile + start_sample;
-
-		/* Initialize work array. */
-		work_array[ray_index] = my_sample ;
-
-		/* Calculate pixel position of this ray. */
-		pixel_x = sx + tile_x;
-		pixel_y = sy + tile_y;
-#endif  /* __WORK_STEALING__ */
-
-		rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
-
-		/* Initialise per_sample_output_buffers to all zeros. */
-		per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
-		int per_sample_output_buffers_iterator = 0;
-		for(per_sample_output_buffers_iterator = 0;
-		    per_sample_output_buffers_iterator < kernel_data.film.pass_stride;
-		    per_sample_output_buffers_iterator++)
-		{
-			per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
-		}
+			int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass;
 
-		/* Initialize random numbers and ray. */
-		kernel_path_trace_setup(kg,
-		                        rng_state,
-		                        my_sample,
-		                        pixel_x, pixel_y,
-		                        &rng_coop[ray_index],
-		                        &Ray_coop[ray_index]);
-
-		if(Ray_coop[ray_index].t != 0.0f) {
-			/* Initialize throughput, L_transparent, Ray, PathState;
-			 * These rays proceed with path-iteration.
-			 */
-			throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
-			L_transparent_coop[ray_index] = 0.0f;
-			path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
-			path_state_init(kg,
-			                kg->sd_input,
-			                &PathState_coop[ray_index],
-			                &rng_coop[ray_index],
-			                my_sample,
-			                &Ray_coop[ray_index]);
-#ifdef __KERNEL_DEBUG__
-			debug_data_init(&debugdata_coop[ray_index]);
-#endif
+			*(buffer + index) = 0.0f;
 		}
-		else {
-			/* These rays do not participate in path-iteration. */
-			float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-			/* Accumulate result in output buffer. */
-			kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
-			path_rng_end(kg, rng_state, rng_coop[ray_index]);
-			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
-		}
-	}
 
-	/* Mark rest of the ray-state indices as RAY_INACTIVE. */
-	if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
-		/* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
-		ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+		parallel_for(kg, i, sw * sh) {
+			int x = sx + i % sw;
+			int y = sy + i / sw;
+
+			int index = (offset + x + y*stride);
+			*(rng_state + index) = hash_int_2d(x, y);
+		}
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
index 82ca18829d3..bdbf7387b95 100644
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ b/intern/cycles/kernel/split/kernel_direct_lighting.h
@@ -14,95 +14,144 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_direct_lighting kernel.
- * This is the eighth kernel in the ray tracing logic. This is the seventh
- * of the path iteration kernels. This kernel takes care of direct lighting
- * logic. However, the "shadow ray cast" part of direct lighting is handled
+/* This kernel takes care of direct lighting logic.
+ * However, the "shadow ray cast" part of direct lighting is handled
  * in the next kernel.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
- * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with direct lighting should be executed. Those rays for which
+ * a shadow_blocked() function for direct-lighting must be executed, are
+ * marked with flag RAY_SHADOW_RAY_CAST_DL and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
+ * and processes only the rays of state RAY_ACTIVE; If a ray needs to execute
+ * the corresponding shadow_blocked part, after direct lighting, the ray is
+ * marked with RAY_SHADOW_RAY_CAST_DL flag.
  *
- * rng_coop -----------------------------------------|--- kernel_direct_lighting --|--- BSDFEval_coop
- * PathState_coop -----------------------------------|                             |--- ISLamp_coop
- * sd -----------------------------------------------|                             |--- LightRay_coop
- * ray_state ----------------------------------------|                             |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                             |
- * kg (globals) -------------------------------------|                             |
- * queuesize ----------------------------------------|                             |
- *
- * Note on Queues :
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
- * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
- * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ * State of queues when this kernel is called:
+ * - State of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and
+ *   QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same before and after this
+ *   kernel call.
+ * - QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a
+ *   shadow_blocked function must be executed, after this kernel call.
+ *   Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
  */
-ccl_device char kernel_direct_lighting(
-        KernelGlobals *kg,
-        ShaderData *sd,                         /* Required for direct lighting */
-        ccl_global uint *rng_coop,              /* Required for direct lighting */
-        ccl_global PathState *PathState_coop,   /* Required for direct lighting */
-        ccl_global int *ISLamp_coop,            /* Required for direct lighting */
-        ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
-        ccl_global char *ray_state,             /* Denotes the state of each ray */
-        int ray_index)
+ccl_device void kernel_direct_lighting(KernelGlobals *kg,
+                                       ccl_local_param unsigned int *local_queue_atomics)
 {
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;  /* Reset once per work-group, before the barrier below. */
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
 	char enqueue_flag = 0;
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		ccl_global PathState *state = &PathState_coop[ray_index];
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		ShaderData *sd = &kernel_split_state.sd[ray_index];
 
 		/* direct lighting */
 #ifdef __EMISSION__
-		if((kernel_data.integrator.use_direct_light &&
-		    (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL)))
-		{
+		RNG rng = kernel_split_state.rng[ray_index];  /* Local copy; written back below once sampling is done. */
+		bool flag = (kernel_data.integrator.use_direct_light &&
+		             (sd->flag & SD_BSDF_HAS_EVAL));
+#  ifdef __SHADOW_TRICKS__
+		if(flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
+			flag = false;  /* Handled by the call below instead of the regular sampling path. */
+			ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+			float3 throughput = kernel_split_state.throughput[ray_index];
+			PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+			kernel_branched_path_surface_connect_light(kg,
+			                                           &rng,
+			                                           sd,
+			                                           emission_sd,
+			                                           state,
+			                                           throughput,
+			                                           1.0f,
+			                                           L,
+			                                           1);
+		}
+#  endif  /* __SHADOW_TRICKS__ */
+		if(flag) {
 			/* Sample illumination from lights to find path contribution. */
-			ccl_global RNG* rng = &rng_coop[ray_index];
-			float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+			float light_t = path_state_rng_1D(kg, &rng, state, PRNG_LIGHT);
 			float light_u, light_v;
-			path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
-			float terminate = path_state_rng_light_termination(kg, rng, state);
+			path_state_rng_2D(kg, &rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+			float terminate = path_state_rng_light_termination(kg, &rng, state);
 
 			LightSample ls;
 			if(light_sample(kg,
 			                light_t, light_u, light_v,
-			                ccl_fetch(sd, time),
-			                ccl_fetch(sd, P),
+			                sd->time,
+			                sd->P,
 			                state->bounce,
 			                &ls)) {
 
 				Ray light_ray;
-#ifdef __OBJECT_MOTION__
-				light_ray.time = ccl_fetch(sd, time);
-#endif
+#  ifdef __OBJECT_MOTION__
+				light_ray.time = sd->time;
+#  endif
 
 				BsdfEval L_light;
 				bool is_lamp;
-				if(direct_emission(kg, sd, kg->sd_input, &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
+				if(direct_emission(kg, sd, &kernel_split_state.sd_DL_shadow[ray_index], &ls, state, &light_ray, &L_light, &is_lamp, terminate)) {
 					/* Write intermediate data to global memory to access from
 					 * the next kernel.
 					 */
-					LightRay_coop[ray_index] = light_ray;
-					BSDFEval_coop[ray_index] = L_light;
-					ISLamp_coop[ray_index] = is_lamp;
+					kernel_split_state.light_ray[ray_index] = light_ray;
+					kernel_split_state.bsdf_eval[ray_index] = L_light;
+					kernel_split_state.is_lamp[ray_index] = is_lamp;
 					/* Mark ray state for next shadow kernel. */
-					ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+					ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
 					enqueue_flag = 1;
 				}
 			}
 		}
+		kernel_split_state.rng[ray_index] = rng;
 #endif  /* __EMISSION__ */
 	}
-	return enqueue_flag;
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+#ifdef __EMISSION__
+	/* Enqueue the rays flagged RAY_SHADOW_RAY_CAST_DL above into QUEUE_SHADOW_RAY_CAST_DL_RAYS. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
new file mode 100644
index 00000000000..47d3c280831
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_do_volume.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Split-kernel volume step: integrate volumes along the ray segment of RAY_ACTIVE and RAY_HIT_BACKGROUND rays. */
+ccl_device void kernel_do_volume(KernelGlobals *kg)
+{
+#ifdef __VOLUME__
+	/* We will empty this queue in this kernel. */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+	/* Fetch use_queues_flag; when set, the ray index is looked up through the queue below. */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(local_use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          1);  /* NOTE(review): presumably the "empty the queue slot" flag -- confirm against get_ray_index(). */
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		/* `hit` is true for RAY_ACTIVE rays and false for RAY_HIT_BACKGROUND rays. */
+		bool hit = ! IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
+
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		RNG rng = kernel_split_state.rng[ray_index];  /* Local copy, written back at the end of this branch. */
+		ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
+		ShaderData *sd = &kernel_split_state.sd[ray_index];
+		ShaderData *sd_input = &kernel_split_state.sd_DL_shadow[ray_index];
+
+		/* Sanitize volume stack. */
+		if(!hit) {
+			kernel_volume_clean_stack(kg, state->volume_stack);
+		}
+		/* volume attenuation, emission, scatter */
+		if(state->volume_stack[0].shader != SHADER_NONE) {
+			Ray volume_ray = *ray;
+			volume_ray.t = (hit)? isect->t: FLT_MAX;  /* End the segment at the intersection distance on a hit, else use FLT_MAX. */
+
+			bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
+
+			{
+				/* integrate along volume segment with distance sampling */
+				VolumeIntegrateResult result = kernel_volume_integrate(
+					kg, state, sd, &volume_ray, L, throughput, &rng, heterogeneous);
+
+#  ifdef __VOLUME_SCATTER__
+				if(result == VOLUME_PATH_SCATTERED) {
+					/* direct lighting */
+					kernel_path_volume_connect_light(kg, &rng, sd, sd_input, *throughput, state, L);
+
+					/* indirect light bounce: RAY_REGENERATED if the bounce call returns true, else RAY_UPDATE_BUFFER */
+					if(kernel_path_volume_bounce(kg, &rng, sd, throughput, state, L, ray))
+						ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED);
+					else
+						ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER);
+				}
+#  endif
+			}
+		}
+		kernel_split_state.rng[ray_index] = rng;  /* Write the local RNG copy back to the split state. */
+	}
+
+#endif
+}
+
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 5d951b972ed..9fc853a84bf 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -14,157 +14,159 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_holdout_emission_blurring_pathtermination_ao kernel.
- * This is the sixth kernel in the ray tracing logic. This is the fifth
- * of the path iteration kernels. This kernel takes care of the logic to process
- * "material of type holdout", indirect primitive emission, bsdf blurring,
- * probabilistic path termination and AO.
+/* This kernel takes care of the logic to process "material of type holdout",
+ * indirect primitive emission, bsdf blurring, probabilistic path termination
+ * and AO.
  *
- * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed.
- * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and
- * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ * This kernel determines the rays for which a shadow_blocked() function
+ * associated with AO should be executed. Those rays for which a
+ * shadow_blocked() function for AO must be executed are marked with flag
+ * RAY_SHADOW_RAY_CAST_AO and enqueued into the queue
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS.
  *
  * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
  *
- * The input and output are as follows,
+ * Note on Queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                                           |--- PathState_coop
- * PathRadiance_coop ------------------------------------|                                                           |--- throughput_coop
- * Intersection_coop ------------------------------------|                                                           |--- L_transparent_coop
- * PathState_coop ---------------------------------------|                                                           |--- per_sample_output_buffers
- * L_transparent_coop -----------------------------------|                                                           |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                                           |--- ShaderData
- * ray_state --------------------------------------------|                                                           |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                           |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                           |--- AOAlpha_coop
- * kg (globals) -----------------------------------------|                                                           |--- AOBSDF_coop
- * parallel_samples -------------------------------------|                                                           |--- AOLightRay_coop
- * per_sample_output_buffers ----------------------------|                                                           |
- * sw ---------------------------------------------------|                                                           |
- * sh ---------------------------------------------------|                                                           |
- * sx ---------------------------------------------------|                                                           |
- * sy ---------------------------------------------------|                                                           |
- * stride -----------------------------------------------|                                                           |
- * work_array -------------------------------------------|                                                           |
- * queuesize --------------------------------------------|                                                           |
- * start_sample -----------------------------------------|                                                           |
- *
- * Note on Queues :
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
+ *     RAY_REGENERATED rays
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
+ *     flag RAY_SHADOW_RAY_CAST_AO
  */
 ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
         KernelGlobals *kg,
-        ShaderData *sd,                        /* Required throughout the kernel except probabilistic path termination and AO */
-        ccl_global float *per_sample_output_buffers,
-        ccl_global uint *rng_coop,             /* Required for "kernel_write_data_passes" and AO */
-        ccl_global float3 *throughput_coop,    /* Required for handling holdout material and AO */
-        ccl_global float *L_transparent_coop,  /* Required for handling holdout material */
-        PathRadiance *PathRadiance_coop,       /* Required for "kernel_write_data_passes" and indirect primitive emission */
-        ccl_global PathState *PathState_coop,  /* Required throughout the kernel and AO */
-        Intersection *Intersection_coop,       /* Required for indirect primitive emission */
-        ccl_global float3 *AOAlpha_coop,       /* Required for AO */
-        ccl_global float3 *AOBSDF_coop,        /* Required for AO */
-        ccl_global Ray *AOLightRay_coop,       /* Required for AO */
-        int sw, int sh, int sx, int sy, int stride,
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        ccl_global unsigned int *work_array,   /* Denotes the work that each ray belongs to */
-#ifdef __WORK_STEALING__
-        unsigned int start_sample,
-#endif
-        int parallel_samples,                  /* Number of samples to be processed in parallel */
-        int ray_index,
-        char *enqueue_flag,
-        char *enqueue_flag_AO_SHADOW_RAY_CAST)
+        ccl_local_param BackgroundAOLocals *locals)
 {
-#ifdef __WORK_STEALING__
-	unsigned int my_work;
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		locals->queue_atomics_bg = 0;
+		locals->queue_atomics_ao = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	char enqueue_flag = 0;
+	char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif  /* __COMPUTE_DEVICE_GPU__ */
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+	int stride = kernel_split_params.stride;
+
+	unsigned int work_index;
 	unsigned int pixel_x;
 	unsigned int pixel_y;
-#endif
+
 	unsigned int tile_x;
 	unsigned int tile_y;
-	int my_sample_tile;
 	unsigned int sample;
 
-	ccl_global RNG *rng = 0x0;
+	RNG rng = kernel_split_state.rng[ray_index];
 	ccl_global PathState *state = 0x0;
 	float3 throughput;
 
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ShaderData *sd = &kernel_split_state.sd[ray_index];
+	ccl_global float *buffer = kernel_split_params.buffer;
+
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 
-		throughput = throughput_coop[ray_index];
-		state = &PathState_coop[ray_index];
-		rng = &rng_coop[ray_index];
-#ifdef __WORK_STEALING__
-		my_work = work_array[ray_index];
-		sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
-		get_pixel_tile_position(&pixel_x, &pixel_y,
+		throughput = kernel_split_state.throughput[ray_index];
+		state = &kernel_split_state.path_state[ray_index];
+
+		work_index = kernel_split_state.work_array[ray_index];
+		sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+		get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
 		                        &tile_x, &tile_y,
-		                        my_work,
-		                        sw, sh, sx, sy,
-		                        parallel_samples,
+		                        work_index,
 		                        ray_index);
-		my_sample_tile = 0;
-#else  /* __WORK_STEALING__ */
-		sample = work_array[ray_index];
-		/* Buffer's stride is "stride"; Find x and y using ray_index. */
-		int tile_index = ray_index / parallel_samples;
-		tile_x = tile_index % sw;
-		tile_y = tile_index / sw;
-		my_sample_tile = ray_index - (tile_index * parallel_samples);
-#endif  /* __WORK_STEALING__ */
-		per_sample_output_buffers +=
-		    (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) *
-		    kernel_data.film.pass_stride;
+
+		buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
+
+#ifdef __SHADOW_TRICKS__
+		if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
+			if (state->flag & PATH_RAY_CAMERA) {
+				state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
+				state->catcher_object = sd->object;
+				if(!kernel_data.background.transparent) {
+					PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+					ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+					L->shadow_color = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
+				}
+			}
+		}
+		else {
+			state->flag &= ~PATH_RAY_SHADOW_CATCHER_ONLY;
+		}
+#endif  /* __SHADOW_TRICKS__ */
 
 		/* holdout */
 #ifdef __HOLDOUT__
-		if(((ccl_fetch(sd, flag) & SD_HOLDOUT) ||
-		    (ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK)) &&
+		if(((sd->flag & SD_HOLDOUT) ||
+		    (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
 		   (state->flag & PATH_RAY_CAMERA))
 		{
 			if(kernel_data.background.transparent) {
 				float3 holdout_weight;
-				if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) {
+				if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
 					holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
 				}
 				else {
 					holdout_weight = shader_holdout_eval(kg, sd);
 				}
 				/* any throughput is ok, should all be identical here */
-				L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+				kernel_split_state.L_transparent[ray_index] += average(holdout_weight*throughput);
 			}
-			if(ccl_fetch(sd, object_flag) & SD_OBJECT_HOLDOUT_MASK) {
+			if(sd->object_flag & SD_OBJECT_HOLDOUT_MASK) {
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-				*enqueue_flag = 1;
+				enqueue_flag = 1;
 			}
 		}
 #endif  /* __HOLDOUT__ */
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		PathRadiance *L = &PathRadiance_coop[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 		/* Holdout mask objects do not write data passes. */
 		kernel_write_data_passes(kg,
-		                         per_sample_output_buffers,
+		                         buffer,
 		                         L,
 		                         sd,
 		                         sample,
@@ -183,12 +185,12 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 
 #ifdef __EMISSION__
 		/* emission */
-		if(ccl_fetch(sd, flag) & SD_EMISSION) {
+		if(sd->flag & SD_EMISSION) {
 			/* TODO(sergey): is isect.t wrong here for transparent surfaces? */
 			float3 emission = indirect_primitive_emission(
 			        kg,
 			        sd,
-			        Intersection_coop[ray_index].t,
+			        kernel_split_state.isect[ray_index].t,
 			        state->flag,
 			        state->ray_pdf);
 			path_radiance_accum_emission(L, throughput, emission, state->bounce);
@@ -203,18 +205,18 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 
 		if(probability == 0.0f) {
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-			*enqueue_flag = 1;
+			enqueue_flag = 1;
 		}
 
 		if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 			if(probability != 1.0f) {
-				float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+				float terminate = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_TERMINATE);
 				if(terminate >= probability) {
 					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-					*enqueue_flag = 1;
+					enqueue_flag = 1;
 				}
 				else {
-					throughput_coop[ray_index] = throughput/probability;
+					kernel_split_state.throughput[ray_index] = throughput/probability;
 				}
 			}
 		}
@@ -224,37 +226,65 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 		/* ambient occlusion */
 		if(kernel_data.integrator.use_ambient_occlusion ||
-		   (ccl_fetch(sd, flag) & SD_AO))
+		   (sd->flag & SD_AO))
 		{
 			/* todo: solve correlation */
 			float bsdf_u, bsdf_v;
-			path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+			path_state_rng_2D(kg, &rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
 
 			float ao_factor = kernel_data.background.ao_factor;
 			float3 ao_N;
-			AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
-			AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+			kernel_split_state.ao_bsdf[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+			kernel_split_state.ao_alpha[ray_index] = shader_bsdf_alpha(kg, sd);
 
 			float3 ao_D;
 			float ao_pdf;
 			sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
 
-			if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+			if(dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
 				Ray _ray;
-				_ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+				_ray.P = ray_offset(sd->P, sd->Ng);
 				_ray.D = ao_D;
 				_ray.t = kernel_data.background.ao_distance;
 #ifdef __OBJECT_MOTION__
-				_ray.time = ccl_fetch(sd, time);
+				_ray.time = sd->time;
 #endif
-				_ray.dP = ccl_fetch(sd, dP);
+				_ray.dP = sd->dP;
 				_ray.dD = differential3_zero();
-				AOLightRay_coop[ray_index] = _ray;
+				kernel_split_state.ao_light_ray[ray_index] = _ray;
 
 				ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
-				*enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+				enqueue_flag_AO_SHADOW_RAY_CAST = 1;
 			}
 		}
 	}
 #endif  /* __AO__ */
+	kernel_split_state.rng[ray_index] = rng;
+
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        &locals->queue_atomics_bg,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#ifdef __AO__
+	/* Enqueue to-shadow-ray-cast rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+	                        enqueue_flag_AO_SHADOW_RAY_CAST,
+	                        kernel_split_params.queue_size,
+	                        &locals->queue_atomics_ao,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+#endif
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
new file mode 100644
index 00000000000..8192528622e
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_background.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* Shade the background for RAY_HIT_BACKGROUND rays and move them to the RAY_UPDATE_BUFFER state. */
+ccl_device void kernel_indirect_background(KernelGlobals *kg)
+{
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	int ray_index;
+	/* With ao_bounces set, RAY_ACTIVE rays whose bounce count exceeds it become RAY_UPDATE_BUFFER. */
+	if(kernel_data.integrator.ao_bounces) {
+		ray_index = get_ray_index(kg, thread_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
+
+		if(ray_index != QUEUE_EMPTY_SLOT) {
+			if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+				ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+				if(state->bounce > kernel_data.integrator.ao_bounces) {
+					ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				}
+			}
+		}
+	}
+	/* Now look up this thread's ray in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS. */
+	ray_index = get_ray_index(kg, thread_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
+
+	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+		/* Transparent background on a camera ray: add to L_transparent; the shader below is skipped unless PASS_BACKGROUND is set. */
+		if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+			*L_transparent = (*L_transparent) + average((*throughput));
+#ifdef __PASSES__
+			if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+
+		if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+#ifdef __BACKGROUND__
+			/* sample background shader and accumulate its radiance into L */
+			float3 L_background = indirect_background(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, ray);
+			path_radiance_accum_background(L, state, (*throughput), L_background);
+#endif
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+		}
+	}
+
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
new file mode 100644
index 00000000000..a56e85abeb9
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+/* Accumulate indirect subsurface radiance for RAY_UPDATE_BUFFER rays; restart the path while indirect rays remain. */
+ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
+{
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index == 0) {
+		/* We will empty both queues in this kernel. */
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+	}
+	/* The first call's result is discarded. NOTE(review): presumably it is made only to empty that queue slot -- confirm. */
+	int ray_index;
+	get_ray_index(kg, thread_index,
+	              QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	              kernel_split_state.queue_data,
+	              kernel_split_params.queue_size,
+	              1);
+	ray_index = get_ray_index(kg, thread_index,
+	                          QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          1);
+
+#ifdef __SUBSURFACE__
+
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+
+	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+		kernel_path_subsurface_accum_indirect(ss_indirect, L);
+
+		/* Trace indirect subsurface rays by restarting the loop. This uses less
+		 * stack memory than invoking kernel_path_indirect.
+		 */
+		if(ss_indirect->num_rays) {
+			kernel_path_subsurface_setup_indirect(kg,
+			                                      ss_indirect,
+			                                      state,
+			                                      ray,
+			                                      L,
+			                                      throughput);
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+		}
+		else {
+			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);  /* NOTE(review): ray is already RAY_UPDATE_BUFFER per the enclosing check; looks redundant. */
+		}
+	}
+
+#endif  /* __SUBSURFACE__ */
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
index 3bd0e361078..c669d79ddcd 100644
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ b/intern/cycles/kernel/split/kernel_lamp_emission.h
@@ -14,50 +14,49 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_lamp_emission
- * This is the 3rd kernel in the ray-tracing logic. This is the second of the
- * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
- * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
- * and RAY_HIT_BACKGROUND.
+/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
  * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- * The input/output of the kernel is as follows,
- * Throughput_coop ------------------------------------|--- kernel_lamp_emission --|--- PathRadiance_coop
- * Ray_coop -------------------------------------------|                           |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * PathState_coop -------------------------------------|                           |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
- * kg (globals) ---------------------------------------|                           |
- * Intersection_coop ----------------------------------|                           |
- * ray_state ------------------------------------------|                           |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                           |
- * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                           |
- * queuesize ------------------------------------------|                           |
- * use_queues_flag ------------------------------------|                           |
- * sw -------------------------------------------------|                           |
- * sh -------------------------------------------------|                           |
  */
-ccl_device void kernel_lamp_emission(
-        KernelGlobals *kg,
-        ccl_global float3 *throughput_coop,    /* Required for lamp emission */
-        PathRadiance *PathRadiance_coop,       /* Required for lamp emission */
-        ccl_global Ray *Ray_coop,              /* Required for lamp emission */
-        ccl_global PathState *PathState_coop,  /* Required for lamp emission */
-        Intersection *Intersection_coop,       /* Required for lamp emission */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global char *use_queues_flag,      /* Used to decide if this kernel should use
-                                                * queues to fetch ray index
-                                                */
-        int ray_index)
+ccl_device void kernel_lamp_emission(KernelGlobals *kg)
 {
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
-	   IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+#ifndef __VOLUME__
+	/* We will empty this queue in this kernel. */
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+	}
+#endif
+	/* Fetch use_queues_flag. */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(local_use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+#ifndef __VOLUME__
+		                          1
+#else
+		                          0
+#endif
+		                          );
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND))
 	{
-		PathRadiance *L = &PathRadiance_coop[ray_index];
-		ccl_global PathState *state = &PathState_coop[ray_index];
+		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
 
-		float3 throughput = throughput_coop[ray_index];
-		Ray ray = Ray_coop[ray_index];
+		float3 throughput = kernel_split_state.throughput[ray_index];
+		Ray ray = kernel_split_state.ray[ray_index];
 
 #ifdef __LAMP_MIS__
 		if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
@@ -65,7 +64,7 @@ ccl_device void kernel_lamp_emission(
 			Ray light_ray;
 
 			light_ray.P = ray.P - state->ray_t*ray.D;
-			state->ray_t += Intersection_coop[ray_index].t;
+			state->ray_t += kernel_split_state.isect[ray_index].t;
 			light_ray.D = ray.D;
 			light_ray.t = state->ray_t;
 			light_ray.time = ray.time;
@@ -74,10 +73,13 @@ ccl_device void kernel_lamp_emission(
 			/* intersect with lamp */
 			float3 emission;
 
-			if(indirect_lamp_emission(kg, kg->sd_input, state, &light_ray, &emission)) {
+			if(indirect_lamp_emission(kg, &kernel_split_state.sd_DL_shadow[ray_index], state, &light_ray, &emission)) {
 				path_radiance_accum_emission(L, throughput, emission, state->bounce);
 			}
 		}
 #endif  /* __LAMP_MIS__ */
 	}
 }
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
index 816f3a6fbff..1bebc16e25b 100644
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
@@ -14,128 +14,163 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_setup_next_iteration kernel.
- * This is the tenth kernel in the ray tracing logic. This is the ninth
- * of the path iteration kernels. This kernel takes care of setting up
- * Ray for the next iteration of path-iteration and accumulating radiance
- * corresponding to AO and direct-lighting
+/* This kernel takes care of setting up the ray for the next iteration of
+ * path-iteration and accumulating radiance corresponding to AO and
+ * direct-lighting.
  *
- * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ * Ray state of rays that are terminated in this kernel are changed
+ * to RAY_UPDATE_BUFFER.
  *
- * The input and output are as follows,
+ * Note on queues:
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
+ * and processes only the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and
+ * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
+ * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
+ * been changed to RAY_UPDATE_BUFFER, there is no problem.
  *
- * rng_coop ---------------------------------------------|--- kernel_next_iteration_setup -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * throughput_coop --------------------------------------|                                 |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
- * PathRadiance_coop ------------------------------------|                                 |--- throughput_coop
- * PathState_coop ---------------------------------------|                                 |--- PathRadiance_coop
- * sd ---------------------------------------------------|                                 |--- PathState_coop
- * ray_state --------------------------------------------|                                 |--- ray_state
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------|                                 |--- Ray_coop
- * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                 |--- use_queues_flag
- * Ray_coop ---------------------------------------------|                                 |
- * kg (globals) -----------------------------------------|                                 |
- * LightRay_dl_coop -------------------------------------|
- * ISLamp_coop ------------------------------------------|
- * BSDFEval_coop ----------------------------------------|
- * LightRay_ao_coop -------------------------------------|
- * AOBSDF_coop ------------------------------------------|
- * AOAlpha_coop -----------------------------------------|
- *
- * Note on queues,
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
- * the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF
- * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
- * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
- * changed to RAY_UPDATE_BUFF, there is no problem.
- *
- * State of queues when this kernel is called :
+ * State of queues when this kernel is called:
  * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
  * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
+ *     RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
  */
-ccl_device char kernel_next_iteration_setup(
-        KernelGlobals *kg,
-        ShaderData *sd,                       /* Required for setting up ray for next iteration */
-        ccl_global uint *rng_coop,            /* Required for setting up ray for next iteration */
-        ccl_global float3 *throughput_coop,   /* Required for setting up ray for next iteration */
-        PathRadiance *PathRadiance_coop,      /* Required for setting up ray for next iteration */
-        ccl_global Ray *Ray_coop,             /* Required for setting up ray for next iteration */
-        ccl_global PathState *PathState_coop, /* Required for setting up ray for next iteration */
-        ccl_global Ray *LightRay_dl_coop,     /* Required for radiance update - direct lighting */
-        ccl_global int *ISLamp_coop,          /* Required for radiance update - direct lighting */
-        ccl_global BsdfEval *BSDFEval_coop,   /* Required for radiance update - direct lighting */
-        ccl_global Ray *LightRay_ao_coop,     /* Required for radiance update - AO */
-        ccl_global float3 *AOBSDF_coop,       /* Required for radiance update - AO */
-        ccl_global float3 *AOAlpha_coop,      /* Required for radiance update - AO */
-        ccl_global char *ray_state,           /* Denotes the state of each ray */
-        ccl_global char *use_queues_flag,     /* flag to decide if scene_intersect kernel should
-                                               * use queues to fetch ray index */
-        int ray_index)
+ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
+                                            ccl_local_param unsigned int *local_queue_atomics)
 {
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	if(ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
+		/* If we are here, then it means that scene-intersect kernel
+		 * has already been executed at least once. From the next time on,
+		 * scene-intersect kernel may operate on queues to fetch ray index.
+		 */
+		*kernel_split_params.use_queues_flag = 1;
+
+		/* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and
+		 * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues that were made empty during the
+		 * previous kernel.
+		 */
+		kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+		kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+	}
+
 	char enqueue_flag = 0;
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
 
 	/* Load ShaderData structure. */
 	PathRadiance *L = NULL;
 	ccl_global PathState *state = NULL;
+	ccl_global char *ray_state = kernel_split_state.ray_state;
 
 	/* Path radiance update for AO/Direct_lighting's shadow blocked. */
 	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
 	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
 	{
-		state = &PathState_coop[ray_index];
-		L = &PathRadiance_coop[ray_index];
-		float3 _throughput = throughput_coop[ray_index];
+		state = &kernel_split_state.path_state[ray_index];
+		L = &kernel_split_state.path_radiance[ray_index];
+		float3 _throughput = kernel_split_state.throughput[ray_index];
 
 		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
-			float3 shadow = LightRay_ao_coop[ray_index].P;
-			char update_path_radiance = LightRay_ao_coop[ray_index].t;
+			float3 shadow = kernel_split_state.ao_light_ray[ray_index].P;
+			// TODO(mai): investigate correctness here
+			char update_path_radiance = (char)kernel_split_state.ao_light_ray[ray_index].t;
 			if(update_path_radiance) {
 				path_radiance_accum_ao(L,
 				                       _throughput,
-				                       AOAlpha_coop[ray_index],
-				                       AOBSDF_coop[ray_index],
+				                       kernel_split_state.ao_alpha[ray_index],
+				                       kernel_split_state.ao_bsdf[ray_index],
 				                       shadow,
 				                       state->bounce);
 			}
+			else {
+				path_radiance_accum_total_ao(L, _throughput, kernel_split_state.ao_bsdf[ray_index]);
+			}
 			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
 		}
 
 		if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
-			float3 shadow = LightRay_dl_coop[ray_index].P;
-			char update_path_radiance = LightRay_dl_coop[ray_index].t;
+			float3 shadow = kernel_split_state.light_ray[ray_index].P;
+			// TODO(mai): investigate correctness here
+			char update_path_radiance = (char)kernel_split_state.light_ray[ray_index].t;
+			BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
 			if(update_path_radiance) {
-				BsdfEval L_light = BSDFEval_coop[ray_index];
 				path_radiance_accum_light(L,
 				                          _throughput,
 				                          &L_light,
 				                          shadow,
 				                          1.0f,
 				                          state->bounce,
-				                          ISLamp_coop[ray_index]);
+				                          kernel_split_state.is_lamp[ray_index]);
+			}
+			else {
+				path_radiance_accum_total_light(L, _throughput, &L_light);
 			}
 			REMOVE_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
 		}
 	}
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		ccl_global float3 *throughput = &throughput_coop[ray_index];
-		ccl_global Ray *ray = &Ray_coop[ray_index];
-		ccl_global RNG *rng = &rng_coop[ray_index];
-		state = &PathState_coop[ray_index];
-		L = &PathRadiance_coop[ray_index];
+		ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+		ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+		RNG rng = kernel_split_state.rng[ray_index];
+		state = &kernel_split_state.path_state[ray_index];
+		L = &kernel_split_state.path_radiance[ray_index];
 
 		/* Compute direct lighting and next bounce. */
-		if(!kernel_path_surface_bounce(kg, rng, sd, throughput, state, L, ray)) {
+		if(!kernel_path_surface_bounce(kg, &rng, &kernel_split_state.sd[ray_index], throughput, state, L, ray)) {
 			ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
 			enqueue_flag = 1;
 		}
+		kernel_split_state.rng[ray_index] = rng;
 	}
 
-	return enqueue_flag;
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
new file mode 100644
index 00000000000..a7ecde7c80d
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Rays that get no work from get_next_work() are marked RAY_INACTIVE; rays
+ * set up with t == 0 are marked RAY_TO_REGENERATE.
+ */
+ccl_device void kernel_path_init(KernelGlobals *kg) {
+	int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
+
+	/* This is the first assignment to ray_state,
+	 * so we don't use the ASSIGN_RAY_STATE macro.
+	 */
+	kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
+
+	unsigned int my_sample;
+	unsigned int pixel_x, pixel_y;
+	unsigned int tile_x, tile_y;
+
+	unsigned int work_index = 0;
+	/* Get work. */
+	if(!get_next_work(kg, &work_index, ray_index)) {
+		/* No more work, mark ray as inactive */
+		kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
+
+		return;
+	}
+
+	/* Get the sample associated with the work. */
+	my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
+
+	/* Get pixel and tile position associated with the work. */
+	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
+	                             &tile_x, &tile_y,
+	                             work_index,
+	                             ray_index);
+	kernel_split_state.work_array[ray_index] = work_index;
+
+	ccl_global uint *rng_state = kernel_split_params.rng_state;
+	rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
+
+	ccl_global float *buffer = kernel_split_params.buffer;
+	buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
+
+	RNG rng = kernel_split_state.rng[ray_index];
+
+	/* Initialize random numbers and ray. */
+	kernel_path_trace_setup(kg,
+	                        rng_state,
+	                        my_sample,
+	                        pixel_x, pixel_y,
+	                        &rng,
+	                        &kernel_split_state.ray[ray_index]);
+
+	if(kernel_split_state.ray[ray_index].t != 0.0f) {
+		/* Initialize throughput, L_transparent, Ray, PathState;
+		 * These rays proceed with path-iteration.
+		 */
+		kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+		kernel_split_state.L_transparent[ray_index] = 0.0f;
+		path_radiance_init(&kernel_split_state.path_radiance[ray_index], kernel_data.film.use_light_pass);
+		path_state_init(kg,
+		                &kernel_split_state.sd_DL_shadow[ray_index],
+		                &kernel_split_state.path_state[ray_index],
+		                &rng,
+		                my_sample,
+		                &kernel_split_state.ray[ray_index]);
+#ifdef __SUBSURFACE__
+		kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
+#endif
+
+#ifdef __KERNEL_DEBUG__
+		debug_data_init(&kernel_split_state.debug_data[ray_index]);
+#endif
+	}
+	else {
+		/* These rays do not participate in path-iteration. */
+		float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+		/* Accumulate result in output buffer. */
+		kernel_write_pass_float4(buffer, my_sample, L_rad);
+		/* Finish with the local rng; the split-state copy is stored only below. */
+		path_rng_end(kg, rng_state, rng);
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
+	}
+	kernel_split_state.rng[ray_index] = rng;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
new file mode 100644
index 00000000000..e2e841f36d3
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_queue_enqueue.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* This kernel enqueues rays of different ray state into their
+ * appropriate queues:
+ *
+ * 1. Rays that have been determined to hit the background from the
+ *    "kernel_scene_intersect" kernel are enqueued in
+ *    QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+ * 2. Rays that have been determined to be actively participating in
+ *    path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * State of queue during other times this kernel is called:
+ * At entry,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
+ *     and RAY_UPDATE_BUFFER rays.
+ * At exit,
+ *   - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
+ *   - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
+ *     RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
+ */
+ccl_device void kernel_queue_enqueue(KernelGlobals *kg,
+                                     ccl_local_param QueueEnqueueLocals *locals)
+{
+	/* We have only 2 cases (Hit/Not-Hit), i.e. two queues to fill. */
+	int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+
+	if(lidx == 0) {  /* Local item 0 zeroes both counters. */
+		locals->queue_atomics[0] = 0;  /* NOTE(review): literal 0/1 presumably */
+		locals->queue_atomics[1] = 0;  /* match the two QUEUE_* ids below - confirm. */
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int queue_number = -1;  /* -1 means this ray is not enqueued here. */
+
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
+	   IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
+	}
+	else if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
+	        IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
+		queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
+	}
+
+	unsigned int my_lqidx;  /* Only assigned and read when queue_number != -1. */
+	if(queue_number != -1) {
+		my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	if(lidx == 0) {  /* Local item 0 replaces each counter with the helper's result. */
+		locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] =
+		        get_global_per_queue_offset(QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                                    locals->queue_atomics,
+		                                    kernel_split_params.queue_index);
+		locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] =
+		        get_global_per_queue_offset(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+		                                    locals->queue_atomics,
+		                                    kernel_split_params.queue_index);
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	unsigned int my_gqidx;  /* Only assigned and read when queue_number != -1. */
+	if(queue_number != -1) {
+		my_gqidx = get_global_queue_index(queue_number,
+		                                  kernel_split_params.queue_size,
+		                                  my_lqidx,
+		                                  locals->queue_atomics);
+		kernel_split_state.queue_data[my_gqidx] = ray_index;  /* Store the ray in its slot. */
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
index 2388580051f..684760eedee 100644
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ b/intern/cycles/kernel/split/kernel_scene_intersect.h
@@ -14,81 +14,47 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_scene_intersect kernel.
- * This is the second kernel in the ray tracing logic. This is the first
- * of the path iteration kernels. This kernel takes care of scene_intersect function.
+/* This kernel takes care of scene_intersect function.
  *
  * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
  * This kernel processes rays of ray state RAY_ACTIVE
- * This kernel determines the rays that have hit the background and changes their ray state to RAY_HIT_BACKGROUND.
- *
- * The input and output are as follows,
- *
- * Ray_coop ---------------------------------------|--------- kernel_scene_intersect----------|--- PathState
- * PathState_coop ---------------------------------|                                          |--- Intersection
- * ray_state --------------------------------------|                                          |--- ray_state
- * use_queues_flag --------------------------------|                                          |
- * QueueData(QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                          |
- * kg (globals) -----------------------------------|                                          |
- * rng_coop ---------------------------------------|                                          |
- * sw ---------------------------------------------|                                          |
- * sh ---------------------------------------------|                                          |
- * queuesize --------------------------------------|                                          |
- *
- * Note on Queues :
- * Ideally we would want kernel_scene_intersect to work on queues.
- * But during the very first time, the queues will be empty and hence we perform a direct mapping
- * between ray-index and thread-index; From the next time onward, the queue will be filled and
- * we may start operating on queues.
- *
- * State of queue during the first time this kernel is called :
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.before and after this kernel
- *
- * State of queues during other times this kernel is called :
- * At entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will have a mix of RAY_ACTIVE, RAY_UPDATE_BUFFER and RAY_REGENERATED rays;
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays ;
- * (The rays that are in the state RAY_UPDATE_BUFFER in both the queues are actually the same rays; These
- * are the rays that were in RAY_ACTIVE state during the initial enqueue but on further processing
- * , by different kernels, have turned into RAY_UPDATE_BUFFER rays. Since all kernel, even after fetching from
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS, proceed further based on ray state information, RAY_UPDATE_BUFFER rays
- * being present in QUEUE_ACTIVE_AND_REGENERATED_RAYS does not cause any logical issues)
- * At exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS - All RAY_REGENERATED rays will have been converted to RAY_ACTIVE and
- * Some rays in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue will move to state RAY_HIT_BACKGROUND
- * QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS - no change
+ * This kernel determines the rays that have hit the background and changes
+ * their ray state to RAY_HIT_BACKGROUND.
  */
-
-ccl_device void kernel_scene_intersect(
-        KernelGlobals *kg,
-        ccl_global uint *rng_coop,
-        ccl_global Ray *Ray_coop,              /* Required for scene_intersect */
-        ccl_global PathState *PathState_coop,  /* Required for scene_intersect */
-        Intersection *Intersection_coop,       /* Required for scene_intersect */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int sw, int sh,
-        ccl_global char *use_queues_flag,      /* used to decide if this kernel should use
-                                                * queues to fetch ray index */
-#ifdef __KERNEL_DEBUG__
-        DebugData *debugdata_coop,
-#endif
-        int ray_index)
+ccl_device void kernel_scene_intersect(KernelGlobals *kg)
 {
+	/* Fetch use_queues_flag */
+	char local_use_queues_flag = *kernel_split_params.use_queues_flag;
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(local_use_queues_flag) {
+		ray_index = get_ray_index(kg, ray_index,
+		                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+		                          kernel_split_state.queue_data,
+		                          kernel_split_params.queue_size,
+		                          0);
+
+		if(ray_index == QUEUE_EMPTY_SLOT) {
+			return;
+		}
+	}
+
 	/* All regenerated rays become active here */
-	if(IS_STATE(ray_state, ray_index, RAY_REGENERATED))
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
+	if(IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED))
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
 
-	if(!IS_STATE(ray_state, ray_index, RAY_ACTIVE))
+	if(!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE))
 		return;
 
 #ifdef __KERNEL_DEBUG__
-	DebugData *debug_data = &debugdata_coop[ray_index];
+	DebugData *debug_data = &kernel_split_state.debug_data[ray_index];
 #endif
-	Intersection *isect = &Intersection_coop[ray_index];
-	PathState state = PathState_coop[ray_index];
-	Ray ray = Ray_coop[ray_index];
+	Intersection isect;
+	PathState state = kernel_split_state.path_state[ray_index];
+	Ray ray = kernel_split_state.ray[ray_index];
 
 	/* intersect scene */
 	uint visibility = path_state_ray_visibility(kg, &state);
@@ -96,7 +62,7 @@ ccl_device void kernel_scene_intersect(
 #ifdef __HAIR__
 	float difl = 0.0f, extmax = 0.0f;
 	uint lcg_state = 0;
-	RNG rng = rng_coop[ray_index];
+	RNG rng = kernel_split_state.rng[ray_index];
 
 	if(kernel_data.bvh.have_curves) {
 		if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
@@ -106,19 +72,25 @@ ccl_device void kernel_scene_intersect(
 		}
 
 		extmax = kernel_data.curve.maximum_width;
-		lcg_state = lcg_state_init(&rng, &state, 0x51633e2d);
+		lcg_state = lcg_state_init(&rng, state.rng_offset, state.sample, 0x51633e2d);
+	}
+
+	if(state.bounce > kernel_data.integrator.ao_bounces) {
+		visibility = PATH_RAY_SHADOW;
+		ray.t = kernel_data.background.ao_distance;
 	}
 
-	bool hit = scene_intersect(kg, ray, visibility, isect, &lcg_state, difl, extmax);
+	bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
 #else
-	bool hit = scene_intersect(kg, ray, visibility, isect, NULL, 0.0f, 0.0f);
+	bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
 #endif
+	kernel_split_state.isect[ray_index] = isect;
 
 #ifdef __KERNEL_DEBUG__
 	if(state.flag & PATH_RAY_CAMERA) {
-		debug_data->num_bvh_traversed_nodes += isect->num_traversed_nodes;
-		debug_data->num_bvh_traversed_instances += isect->num_traversed_instances;
-		debug_data->num_bvh_intersections += isect->num_intersections;
+		debug_data->num_bvh_traversed_nodes += isect.num_traversed_nodes;
+		debug_data->num_bvh_traversed_instances += isect.num_traversed_instances;
+		debug_data->num_bvh_intersections += isect.num_intersections;
 	}
 	debug_data->num_ray_bounces++;
 #endif
@@ -128,6 +100,8 @@ ccl_device void kernel_scene_intersect(
 		 * These rays undergo special processing in the
 		 * background_bufferUpdate kernel.
 		 */
-		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
+		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index cef64bf5f36..0f1696e34a0 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -14,57 +14,58 @@
  * limitations under the License.
  */
 
-#include "kernel_split_common.h"
+CCL_NAMESPACE_BEGIN
 
-/* Note on kernel_shader_eval kernel
- * This kernel is the 5th kernel in the ray tracing logic. This is
- * the 4rd kernel in path iteration. This kernel sets up the ShaderData
- * structure from the values computed by the previous kernels. It also identifies
- * the rays of state RAY_TO_REGENERATE and enqueues them in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+/* This kernel sets up the ShaderData structure from the values computed
+ * by the previous kernels.
  *
- * The input and output of the kernel is as follows,
- * rng_coop -------------------------------------------|--- kernel_shader_eval --|--- sd
- * Ray_coop -------------------------------------------|                         |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * PathState_coop -------------------------------------|                         |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
- * Intersection_coop ----------------------------------|                         |
- * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS)-------|                         |
- * Queue_index(QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)---|                         |
- * ray_state ------------------------------------------|                         |
- * kg (globals) ---------------------------------------|                         |
- * queuesize ------------------------------------------|                         |
- *
- * Note on Queues :
- * This kernel reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
- * only the rays of state RAY_ACTIVE;
- * State of queues when this kernel is called,
- * at entry,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- * at exit,
- * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays
+ * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
+ * in QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
  */
-ccl_device void kernel_shader_eval(
-        KernelGlobals *kg,
-        ShaderData *sd,                        /* Output ShaderData structure to be filled */
-        ccl_global uint *rng_coop,             /* Required for rbsdf calculation */
-        ccl_global Ray *Ray_coop,              /* Required for setting up shader from ray */
-        ccl_global PathState *PathState_coop,  /* Required for all functions in this kernel */
-        Intersection *Intersection_coop,       /* Required for setting up shader from ray */
-        ccl_global char *ray_state,            /* Denotes the state of each ray */
-        int ray_index)
+ccl_device void kernel_shader_eval(KernelGlobals *kg,
+                                   ccl_local_param unsigned int *local_queue_atomics)
 {
-	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
-		Intersection *isect = &Intersection_coop[ray_index];
-		ccl_global uint *rng = &rng_coop[ray_index];
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		Ray ray = Ray_coop[ray_index];
+	/* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+	char enqueue_flag = 0;
+	if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
+		enqueue_flag = 1;
+	}
+
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+	/* Continue on with shader evaluation. */
+	if((ray_index != QUEUE_EMPTY_SLOT) && IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
+		Intersection isect = kernel_split_state.isect[ray_index];
+		RNG rng = kernel_split_state.rng[ray_index];
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		Ray ray = kernel_split_state.ray[ray_index];
 
 		shader_setup_from_ray(kg,
-		                      sd,
-		                      isect,
+		                      &kernel_split_state.sd[ray_index],
+		                      &isect,
 		                      &ray);
-		float rbsdf = path_state_rng_1D_for_decision(kg, rng, state, PRNG_BSDF);
-		shader_eval_surface(kg, sd, rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+		float rbsdf = path_state_rng_1D_for_decision(kg, &rng, state, PRNG_BSDF);
+		shader_eval_surface(kg, &kernel_split_state.sd[ray_index], &rng, state, rbsdf, state->flag, SHADER_CONTEXT_MAIN);
+		kernel_split_state.rng[ray_index] = rng;
 	}
 }
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked.h b/intern/cycles/kernel/split/kernel_shadow_blocked.h
deleted file mode 100644
index 6153af47f96..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_split_common.h"
-
-/* Note on kernel_shadow_blocked kernel.
- * This is the ninth kernel in the ray tracing logic. This is the eighth
- * of the path iteration kernels. This kernel takes care of "shadow ray cast"
- * logic of the direct lighting and AO  part of ray tracing.
- *
- * The input and output are as follows,
- *
- * PathState_coop ----------------------------------|--- kernel_shadow_blocked --|
- * LightRay_dl_coop --------------------------------|                            |--- LightRay_dl_coop
- * LightRay_ao_coop --------------------------------|                            |--- LightRay_ao_coop
- * ray_state ---------------------------------------|                            |--- ray_state
- * Queue_data(QUEUE_SHADOW_RAY_CAST_AO_RAYS &       |                            |--- Queue_data (QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_AO_RAYS)
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * Queue_index(QUEUE_SHADOW_RAY_CAST_AO_RAYS&
-              QUEUE_SHADOW_RAY_CAST_DL_RAYS) -------|                            |
- * kg (globals) ------------------------------------|                            |
- * queuesize ---------------------------------------|                            |
- *
- * Note on sd_shadow : sd_shadow is neither input nor output to this kernel. sd_shadow is filled and consumed in this kernel itself.
- * Note on queues :
- * The kernel fetches from QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS queues. We will empty
- * these queues this kernel.
- * State of queues when this kernel is called :
- * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
- * before and after this kernel call.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS & QUEUE_SHADOW_RAY_CAST_DL_RAYS will be filled with rays marked with flags RAY_SHADOW_RAY_CAST_AO
- * and RAY_SHADOW_RAY_CAST_DL respectively, during kernel entry.
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty at kernel exit.
- */
-ccl_device void kernel_shadow_blocked(
-        KernelGlobals *kg,
-        ccl_global PathState *PathState_coop,  /* Required for shadow blocked */
-        ccl_global Ray *LightRay_dl_coop,      /* Required for direct lighting's shadow blocked */
-        ccl_global Ray *LightRay_ao_coop,      /* Required for AO's shadow blocked */
-        ccl_global char *ray_state,
-        char shadow_blocked_type,
-        int ray_index)
-{
-	/* Flag determining if we need to update L. */
-	char update_path_radiance = 0;
-
-	if(IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL) ||
-	   IS_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO))
-	{
-		ccl_global PathState *state = &PathState_coop[ray_index];
-		ccl_global Ray *light_ray_dl_global = &LightRay_dl_coop[ray_index];
-		ccl_global Ray *light_ray_ao_global = &LightRay_ao_coop[ray_index];
-
-		ccl_global Ray *light_ray_global =
-		        shadow_blocked_type == RAY_SHADOW_RAY_CAST_AO
-		                ? light_ray_ao_global
-		                : light_ray_dl_global;
-
-		float3 shadow;
-		update_path_radiance = !(shadow_blocked(kg,
-		                                        kg->sd_input,
-		                                        state,
-		                                        light_ray_global,
-		                                        &shadow));
-
-		/* We use light_ray_global's P and t to store shadow and
-		 * update_path_radiance.
-		 */
-		light_ray_global->P = shadow;
-		light_ray_global->t = update_path_radiance;
-	}
-}
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
new file mode 100644
index 00000000000..4243e18de72
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for AO. */
+ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
+{
+	unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index < ao_queue_length) {
+		ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS,
+		                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	/* Flag determining if we need to update L. */
+	char update_path_radiance = 0;
+
+	if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		ccl_global Ray *light_ray_global = &kernel_split_state.ao_light_ray[ray_index];
+
+		float3 shadow;
+		Ray ray = *light_ray_global;
+		update_path_radiance = !(shadow_blocked(kg,
+		                                        &kernel_split_state.sd_DL_shadow[ray_index],
+		                                        state,
+		                                        &ray,
+		                                        &shadow));
+
+		*light_ray_global = ray;
+		/* We use light_ray_global's P and t to store shadow and
+		 * update_path_radiance.
+		 */
+		light_ray_global->P = shadow;
+		light_ray_global->t = update_path_radiance;
+	}
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
new file mode 100644
index 00000000000..bb8f0157965
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+/* Shadow ray cast for direct visible light. */
+ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
+{
+	unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = QUEUE_EMPTY_SLOT;
+	int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	if(thread_index < dl_queue_length) {
+		ray_index = get_ray_index(kg, thread_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS,
+		                          kernel_split_state.queue_data, kernel_split_params.queue_size, 1);
+	}
+
+	if(ray_index == QUEUE_EMPTY_SLOT)
+		return;
+
+	/* Flag determining if we need to update L. */
+	char update_path_radiance = 0;
+
+	if(IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL)) {
+		ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+		ccl_global Ray *light_ray_global = &kernel_split_state.light_ray[ray_index];
+
+		float3 shadow;
+		Ray ray = *light_ray_global;
+		update_path_radiance = !(shadow_blocked(kg,
+		                                        &kernel_split_state.sd_DL_shadow[ray_index],
+		                                        state,
+		                                        &ray,
+		                                        &shadow));
+
+		*light_ray_global = ray;
+		/* We use light_ray_global's P and t to store shadow and
+		 * update_path_radiance.
+		 */
+		light_ray_global->P = shadow;
+		light_ray_global->t = update_path_radiance;
+	}
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
index 2135ee22b2e..4303ba0a905 100644
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ b/intern/cycles/kernel/split/kernel_split_common.h
@@ -17,48 +17,61 @@
 #ifndef  __KERNEL_SPLIT_H__
 #define  __KERNEL_SPLIT_H__
 
-#include "kernel_compat_opencl.h"
-#include "kernel_math.h"
-#include "kernel_types.h"
-#include "kernel_globals.h"
-#include "kernel_image_opencl.h"
+#include "kernel/kernel_math.h"
+#include "kernel/kernel_types.h"
 
-#include "util_atomic.h"
+#include "kernel/split/kernel_split_data.h"
 
-#include "kernel_random.h"
-#include "kernel_projection.h"
-#include "kernel_montecarlo.h"
-#include "kernel_differential.h"
-#include "kernel_camera.h"
+#include "kernel/kernel_globals.h"
 
-#include "geom/geom.h"
-#include "bvh/bvh.h"
+#ifdef __OSL__
+#  include "kernel/osl/osl_shader.h"
+#endif
+
+#ifdef __KERNEL_OPENCL__
+#  include "kernel/kernel_image_opencl.h"
+#endif
+#ifdef __KERNEL_CPU__
+#  include "kernel/kernels/cpu/kernel_cpu_image.h"
+#endif
+
+#include "util/util_atomic.h"
+
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_camera.h"
+
+#include "kernel/geom/geom.h"
+#include "kernel/bvh/bvh.h"
 
-#include "kernel_accumulate.h"
-#include "kernel_shader.h"
-#include "kernel_light.h"
-#include "kernel_passes.h"
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_shader.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
 
 #ifdef __SUBSURFACE__
-#include "kernel_subsurface.h"
+#  include "kernel/kernel_subsurface.h"
 #endif
 
 #ifdef __VOLUME__
-#include "kernel_volume.h"
+#  include "kernel/kernel_volume.h"
 #endif
 
-#include "kernel_path_state.h"
-#include "kernel_shadow.h"
-#include "kernel_emission.h"
-#include "kernel_path_common.h"
-#include "kernel_path_surface.h"
-#include "kernel_path_volume.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shadow.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_path_common.h"
+#include "kernel/kernel_path_surface.h"
+#include "kernel/kernel_path_volume.h"
+#include "kernel/kernel_path_subsurface.h"
 
 #ifdef __KERNEL_DEBUG__
-#include "kernel_debug.h"
+#  include "kernel/kernel_debug.h"
 #endif
 
-#include "kernel_queues.h"
-#include "kernel_work_stealing.h"
+#include "kernel/kernel_queues.h"
+#include "kernel/kernel_work_stealing.h"
 
 #endif  /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
new file mode 100644
index 00000000000..17e6587883a
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_H__
+#define __KERNEL_SPLIT_DATA_H__
+
+#include "kernel/split/kernel_split_data_types.h"
+#include "kernel/kernel_globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
+{
+	(void)kg;  /* Unused on CPU. */
+
+	uint64_t size = 0;
+#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
+	size = size SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+
+#ifdef __SUBSURFACE__
+	size += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16); /* ss_rays */
+#endif
+
+#ifdef __VOLUME__
+	size += align_up(2 * num_elements * sizeof(PathState), 16); /* state_shadow */
+#endif
+
+	return size;
+}
+
+ccl_device_inline void split_data_init(KernelGlobals *kg,
+                                       ccl_global SplitData *split_data,
+                                       size_t num_elements,
+                                       ccl_global void *data,
+                                       ccl_global char *ray_state)
+{
+	(void)kg;  /* Unused on CPU. */
+
+	ccl_global char *p = (ccl_global char*)data;
+
+#define SPLIT_DATA_ENTRY(type, name, num) \
+	split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16);
+	SPLIT_DATA_ENTRIES;
+#undef SPLIT_DATA_ENTRY
+
+#ifdef __SUBSURFACE__
+	split_data->ss_rays = (ccl_global SubsurfaceIndirectRays*)p;
+	p += align_up(num_elements * sizeof(SubsurfaceIndirectRays), 16);
+#endif
+
+#ifdef __VOLUME__
+	split_data->state_shadow = (ccl_global PathState*)p;
+	p += align_up(2 * num_elements * sizeof(PathState), 16);
+#endif
+
+	split_data->ray_state = ray_state;
+}
+
+CCL_NAMESPACE_END
+
+#endif  /* __KERNEL_SPLIT_DATA_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
new file mode 100644
index 00000000000..748197b7183
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
+#define __KERNEL_SPLIT_DATA_TYPES_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Parameters used by the split kernels; we use a single struct to avoid passing these to each kernel. */
+
+typedef struct SplitParams {
+	int x;
+	int y;
+	int w;
+	int h;
+
+	int offset;
+	int stride;
+
+	ccl_global uint *rng_state;
+
+	int start_sample;
+	int end_sample;
+
+	ccl_global unsigned int *work_pools;
+	unsigned int num_samples;
+
+	ccl_global int *queue_index;
+	int queue_size;
+	ccl_global char *use_queues_flag;
+
+	ccl_global float *buffer;
+} SplitParams;
+
+/* Global memory variables [porting]; this memory is used for
+ * co-operation between different kernels; data written by one
+ * kernel will be available to another kernel via this global
+ * memory.
+ */
+
+/* SPLIT_DATA_ENTRY(type, name, num) */
+
+#if defined(WITH_CYCLES_DEBUG) || defined(__KERNEL_DEBUG__)
+/* DebugData memory */
+#  define SPLIT_DATA_DEBUG_ENTRIES \
+	SPLIT_DATA_ENTRY(DebugData, debug_data, 1)
+#else
+#  define SPLIT_DATA_DEBUG_ENTRIES
+#endif
+
+#define SPLIT_DATA_ENTRIES \
+	SPLIT_DATA_ENTRY(ccl_global RNG, rng, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float, L_transparent, 1) \
+	SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float3, ao_alpha, 1) \
+	SPLIT_DATA_ENTRY(ccl_global float3, ao_bsdf, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, ao_light_ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
+	SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
+	SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
+	SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
+	SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
+	SPLIT_DATA_DEBUG_ENTRIES \
+
+/* struct that holds pointers to data in the shared state buffer */
+typedef struct SplitData {
+#define SPLIT_DATA_ENTRY(type, name, num) type *name;
+	SPLIT_DATA_ENTRIES
+#undef SPLIT_DATA_ENTRY
+
+#ifdef __SUBSURFACE__
+	ccl_global SubsurfaceIndirectRays *ss_rays;
+#endif
+
+#ifdef __VOLUME__
+	ccl_global PathState *state_shadow;
+#endif
+
+	/* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
+	 * the host easily) but is still used the same as the other data so we have it here in this struct as well
+	 */
+	ccl_global char *ray_state;
+} SplitData;
+
+#ifndef __KERNEL_CUDA__
+#  define kernel_split_state (kg->split_data)
+#  define kernel_split_params (kg->split_param_data)
+#else
+__device__ SplitData __split_data;
+#  define kernel_split_state (__split_data)
+__device__ SplitParams __split_param_data;
+#  define kernel_split_params (__split_param_data)
+#endif  /* __KERNEL_CUDA__ */
+
+/* Local storage for queue_enqueue kernel. */
+typedef struct QueueEnqueueLocals {
+	uint queue_atomics[2];
+} QueueEnqueueLocals;
+
+/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
+typedef struct BackgroundAOLocals {
+	uint queue_atomics_bg;
+	uint queue_atomics_ao;
+} BackgroundAOLocals;
+
+CCL_NAMESPACE_END
+
+#endif  /* __KERNEL_SPLIT_DATA_TYPES_H__ */
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
new file mode 100644
index 00000000000..0b4d50c70ee
--- /dev/null
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
+
+ccl_device void kernel_subsurface_scatter(KernelGlobals *kg,
+                                          ccl_local_param unsigned int* local_queue_atomics)
+{
+#ifdef __SUBSURFACE__
+	if(ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
+		*local_queue_atomics = 0;
+	}
+	ccl_barrier(CCL_LOCAL_MEM_FENCE);
+
+	int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
+	ray_index = get_ray_index(kg, ray_index,
+	                          QUEUE_ACTIVE_AND_REGENERATED_RAYS,
+	                          kernel_split_state.queue_data,
+	                          kernel_split_params.queue_size,
+	                          0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+	/* If we are executing on a GPU device, we exit all threads that are not
+	 * required.
+	 *
+	 * If we are executing on a CPU device, then we need to keep all threads
+	 * active since we have barrier() calls later in the kernel. CPU devices
+	 * expect all threads to execute the barrier statement.
+	 */
+	if(ray_index == QUEUE_EMPTY_SLOT) {
+		return;
+	}
+#endif
+
+	char enqueue_flag = 0;
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+
+	ccl_global char *ray_state = kernel_split_state.ray_state;
+	ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
+	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
+	RNG rng = kernel_split_state.rng[ray_index];
+	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
+	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
+	ShaderData *sd = &kernel_split_state.sd[ray_index];
+	ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
+
+	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+		if(sd->flag & SD_BSSRDF) {
+			if(kernel_path_subsurface_scatter(kg,
+			                                  sd,
+			                                  emission_sd,
+			                                  L,
+			                                  state,
+			                                  &rng,
+			                                  ray,
+			                                  throughput,
+			                                  ss_indirect)) {
+				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+				enqueue_flag = 1;
+			}
+		}
+		kernel_split_state.rng[ray_index] = rng;
+	}
+
+#ifndef __COMPUTE_DEVICE_GPU__
+	}
+#endif
+
+	/* Enqueue RAY_UPDATE_BUFFER rays. */
+	enqueue_ray_index_local(ray_index,
+	                        QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
+	                        enqueue_flag,
+	                        kernel_split_params.queue_size,
+	                        local_queue_atomics,
+	                        kernel_split_state.queue_data,
+	                        kernel_split_params.queue_index);
+
+#endif  /* __SUBSURFACE__ */
+
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
deleted file mode 100644
index a21e9b6a0b1..00000000000
--- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "../kernel_compat_opencl.h"
-#include "../kernel_math.h"
-#include "../kernel_types.h"
-#include "../kernel_globals.h"
-
-/* Since we process various samples in parallel; The output radiance of different samples
- * are stored in different locations; This kernel combines the output radiance contributed
- * by all different samples and stores them in the RenderTile's output buffer.
- */
-ccl_device void kernel_sum_all_radiance(
-        ccl_constant KernelData *data,               /* To get pass_stride to offet into buffer */
-        ccl_global float *buffer,                    /* Output buffer of RenderTile */
-        ccl_global float *per_sample_output_buffer,  /* Radiance contributed by all samples */
-        int parallel_samples, int sw, int sh, int stride,
-        int buffer_offset_x,
-        int buffer_offset_y,
-        int buffer_stride,
-        int start_sample)
-{
-	int x = get_global_id(0);
-	int y = get_global_id(1);
-
-	if(x < sw && y < sh) {
-		buffer += ((buffer_offset_x + x) + (buffer_offset_y + y) * buffer_stride) * (data->film.pass_stride);
-		per_sample_output_buffer += ((x + y * stride) * parallel_samples) * (data->film.pass_stride);
-
-		int sample_stride = (data->film.pass_stride);
-
-		int sample_iterator = 0;
-		int pass_stride_iterator = 0;
-		int num_floats = data->film.pass_stride;
-
-		for(sample_iterator = 0; sample_iterator < parallel_samples; sample_iterator++) {
-			for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
-				*(buffer + pass_stride_iterator) =
-				        (start_sample == 0 && sample_iterator == 0)
-				                ? *(per_sample_output_buffer + pass_stride_iterator)
-				                : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
-			}
-			per_sample_output_buffer += sample_stride;
-		}
-	}
-}
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 88ec7fe6fcc..d748e76fa80 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -39,7 +39,7 @@
  * mostly taken care of in the SVM compiler.
  */
 
-#include "svm_types.h"
+#include "kernel/svm/svm_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -139,49 +139,49 @@ CCL_NAMESPACE_END
 
 /* Nodes */
 
-#include "svm_noise.h"
+#include "kernel/svm/svm_noise.h"
 #include "svm_texture.h"
 
-#include "svm_color_util.h"
-#include "svm_math_util.h"
-
-#include "svm_attribute.h"
-#include "svm_gradient.h"
-#include "svm_blackbody.h"
-#include "svm_closure.h"
-#include "svm_noisetex.h"
-#include "svm_convert.h"
-#include "svm_displace.h"
-#include "svm_fresnel.h"
-#include "svm_wireframe.h"
-#include "svm_wavelength.h"
-#include "svm_camera.h"
-#include "svm_geometry.h"
-#include "svm_hsv.h"
-#include "svm_image.h"
-#include "svm_gamma.h"
-#include "svm_brightness.h"
-#include "svm_invert.h"
-#include "svm_light_path.h"
-#include "svm_magic.h"
-#include "svm_mapping.h"
-#include "svm_normal.h"
-#include "svm_wave.h"
-#include "svm_math.h"
-#include "svm_mix.h"
-#include "svm_ramp.h"
-#include "svm_sepcomb_hsv.h"
-#include "svm_sepcomb_vector.h"
-#include "svm_musgrave.h"
-#include "svm_sky.h"
-#include "svm_tex_coord.h"
-#include "svm_value.h"
-#include "svm_voronoi.h"
-#include "svm_checker.h"
-#include "svm_brick.h"
-#include "svm_vector_transform.h"
-#include "svm_voxel.h"
-#include "svm_bump.h"
+#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_math_util.h"
+
+#include "kernel/svm/svm_attribute.h"
+#include "kernel/svm/svm_gradient.h"
+#include "kernel/svm/svm_blackbody.h"
+#include "kernel/svm/svm_closure.h"
+#include "kernel/svm/svm_noisetex.h"
+#include "kernel/svm/svm_convert.h"
+#include "kernel/svm/svm_displace.h"
+#include "kernel/svm/svm_fresnel.h"
+#include "kernel/svm/svm_wireframe.h"
+#include "kernel/svm/svm_wavelength.h"
+#include "kernel/svm/svm_camera.h"
+#include "kernel/svm/svm_geometry.h"
+#include "kernel/svm/svm_hsv.h"
+#include "kernel/svm/svm_image.h"
+#include "kernel/svm/svm_gamma.h"
+#include "kernel/svm/svm_brightness.h"
+#include "kernel/svm/svm_invert.h"
+#include "kernel/svm/svm_light_path.h"
+#include "kernel/svm/svm_magic.h"
+#include "kernel/svm/svm_mapping.h"
+#include "kernel/svm/svm_normal.h"
+#include "kernel/svm/svm_wave.h"
+#include "kernel/svm/svm_math.h"
+#include "kernel/svm/svm_mix.h"
+#include "kernel/svm/svm_ramp.h"
+#include "kernel/svm/svm_sepcomb_hsv.h"
+#include "kernel/svm/svm_sepcomb_vector.h"
+#include "kernel/svm/svm_musgrave.h"
+#include "kernel/svm/svm_sky.h"
+#include "kernel/svm/svm_tex_coord.h"
+#include "kernel/svm/svm_value.h"
+#include "kernel/svm/svm_voronoi.h"
+#include "kernel/svm/svm_checker.h"
+#include "kernel/svm/svm_brick.h"
+#include "kernel/svm/svm_vector_transform.h"
+#include "kernel/svm/svm_voxel.h"
+#include "kernel/svm/svm_bump.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -192,7 +192,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_noinline void svm_eval_nodes(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, ShaderType type, int path_flag)
 {
 	float stack[SVM_STACK_SIZE];
-	int offset = ccl_fetch(sd, shader) & SHADER_MASK;
+	int offset = sd->shader & SHADER_MASK;
 
 	while(1) {
 		uint4 node = read_node(kg, &offset);
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 0e55c99ae97..229a3f20421 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -27,7 +27,7 @@ ccl_device AttributeDescriptor svm_node_attr_init(KernelGlobals *kg, ShaderData
 
 	AttributeDescriptor desc;
 
-	if(ccl_fetch(sd, object) != OBJECT_NONE) {
+	if(sd->object != OBJECT_NONE) {
 		desc = find_attribute(kg, sd, node.y);
 		if(desc.offset == ATTR_STD_NOT_FOUND) {
 			desc = attribute_not_found();
diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h
index 04a8c7b64e5..610d9af9e1f 100644
--- a/intern/cycles/kernel/svm/svm_bump.h
+++ b/intern/cycles/kernel/svm/svm_bump.h
@@ -21,9 +21,9 @@ CCL_NAMESPACE_BEGIN
 ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
 {
 	/* save state */
-	stack_store_float3(stack, offset+0, ccl_fetch(sd, P));
-	stack_store_float3(stack, offset+3, ccl_fetch(sd, dP).dx);
-	stack_store_float3(stack, offset+6, ccl_fetch(sd, dP).dy);
+	stack_store_float3(stack, offset+0, sd->P);
+	stack_store_float3(stack, offset+3, sd->dP.dx);
+	stack_store_float3(stack, offset+6, sd->dP.dy);
 
 	/* set state as if undisplaced */
 	const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_POSITION_UNDISPLACED);
@@ -36,18 +36,18 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg, ShaderData *sd, floa
 		object_dir_transform(kg, sd, &dPdx);
 		object_dir_transform(kg, sd, &dPdy);
 
-		ccl_fetch(sd, P) = P;
-		ccl_fetch(sd, dP).dx = dPdx;
-		ccl_fetch(sd, dP).dy = dPdy;
+		sd->P = P;
+		sd->dP.dx = dPdx;
+		sd->dP.dy = dPdy;
 	}
 }
 
 ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg, ShaderData *sd, float *stack, uint offset)
 {
 	/* restore state */
-	ccl_fetch(sd, P) = stack_load_float3(stack, offset+0);
-	ccl_fetch(sd, dP).dx = stack_load_float3(stack, offset+3);
-	ccl_fetch(sd, dP).dy = stack_load_float3(stack, offset+6);
+	sd->P = stack_load_float3(stack, offset+0);
+	sd->dP.dx = stack_load_float3(stack, offset+3);
+	sd->dP.dy = stack_load_float3(stack, offset+6);
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 00678a49d70..90249dfd978 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -23,7 +23,7 @@ ccl_device void svm_node_camera(KernelGlobals *kg, ShaderData *sd, float *stack,
 	float3 vector;
 
 	Transform tfm = kernel_data.cam.worldtocamera;
-	vector = transform_point(&tfm, ccl_fetch(sd, P));
+	vector = transform_point(&tfm, sd->P);
 	zdepth = vector.z;
 	distance = len(vector);
 
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index 024d7d6447a..9a3689a94f4 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -25,13 +25,13 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 			bsdf->alpha_y = 0.0f;
 			bsdf->alpha_x = 0.0f;
 			bsdf->ior = eta;
-			ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+			sd->flag |= bsdf_refraction_setup(bsdf);
 		}
 		else {
 			bsdf->alpha_y = 0.0f;
 			bsdf->alpha_x = 0.0f;
 			bsdf->ior = 0.0f;
-			ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
+			sd->flag |= bsdf_reflection_setup(bsdf);
 		}
 	}
 	else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID) {
@@ -40,9 +40,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+			sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
+			sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
 	}
 	else {
 		bsdf->alpha_x = roughness;
@@ -50,9 +50,9 @@ ccl_device void svm_node_glass_setup(ShaderData *sd, MicrofacetBsdf *bsdf, int t
 		bsdf->ior = eta;
 
 		if(refract)
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+			sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 		else
-			ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
+			sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
 	}
 }
 
@@ -70,7 +70,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 	if(mix_weight == 0.0f)
 		return;
 
-	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): ccl_fetch(sd, N);
+	float3 N = stack_valid(data_node.x)? stack_load_float3(stack, data_node.x): sd->N;
 
 	float param1 = (stack_valid(param1_offset))? stack_load_float(stack, param1_offset): __uint_as_float(node.z);
 	float param2 = (stack_valid(param2_offset))? stack_load_float(stack, param2_offset): __uint_as_float(node.w);
@@ -110,10 +110,10 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				T = rotate_around_axis(T, N, anisotropic_rotation * M_2PI_F);
 
 			/* calculate ior */
-			float ior = (ccl_fetch(sd, flag) & SD_BACKFACING) ? 1.0f / eta : eta;
+			float ior = (sd->flag & SD_BACKFACING) ? 1.0f / eta : eta;
 
 			// calculate fresnel for refraction
-			float cosNO = dot(N, ccl_fetch(sd, I));
+			float cosNO = dot(N, sd->I);
 			float fresnel = fresnel_dielectric_cos(cosNO, ior);
 
 			// calculate weights of the diffuse and specular part
@@ -129,7 +129,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 			// get the additional clearcoat normal and subsurface scattering radius
 			uint4 data_cn_ssr = read_node(kg, offset);
-			float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : ccl_fetch(sd, N);
+			float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ? stack_load_float3(stack, data_cn_ssr.x) : sd->N;
 			float3 subsurface_radius = stack_valid(data_cn_ssr.y) ? stack_load_float3(stack, data_cn_ssr.y) : make_float3(1.0f, 1.0f, 1.0f);
 
 			// get the subsurface color
@@ -137,7 +137,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			float3 subsurface_color = stack_valid(data_subsurface_color.x) ? stack_load_float3(stack, data_subsurface_color.x) :
 				make_float3(__uint_as_float(data_subsurface_color.y), __uint_as_float(data_subsurface_color.z), __uint_as_float(data_subsurface_color.w));
 
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 
 #ifdef __SUBSURFACE__
 			float3 albedo = subsurface_color * subsurface + base_color * (1.0f - subsurface);
@@ -163,7 +163,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bsdf->roughness = roughness;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bsdf_principled_diffuse_setup(bsdf);
+						sd->flag |= bsdf_principled_diffuse_setup(bsdf);
 					}
 				}
 				else if(subsurface > CLOSURE_WEIGHT_CUTOFF && subsurf_sample_weight > CLOSURE_WEIGHT_CUTOFF) {
@@ -186,7 +186,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bssrdf->roughness = roughness;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+						sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
 					}
 
 					bssrdf = bssrdf_alloc(sd, make_float3(0.0f, subsurf_weight.y, 0.0f));
@@ -200,7 +200,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bssrdf->roughness = roughness;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+						sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
 					}
 
 					bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, subsurf_weight.z));
@@ -214,7 +214,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bssrdf->roughness = roughness;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
+						sd->flag |= bssrdf_setup(bssrdf, (ClosureType)CLOSURE_BSSRDF_PRINCIPLED_ID);
 					}
 				}
 			}
@@ -230,7 +230,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->roughness = roughness;
 
 					/* setup bsdf */
-					ccl_fetch(sd, flag) |= bsdf_principled_diffuse_setup(bsdf);
+					sd->flag |= bsdf_principled_diffuse_setup(bsdf);
 				}
 			}
 #endif
@@ -251,7 +251,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->N = N;
 
 					/* setup bsdf */
-					ccl_fetch(sd, flag) |= bsdf_principled_sheen_setup(bsdf);
+					sd->flag |= bsdf_principled_sheen_setup(bsdf);
 				}
 			}
 
@@ -286,9 +286,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 						/* setup bsdf */
 						if(distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID || roughness <= 0.075f) /* use single-scatter GGX */
-							ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf);
+							sd->flag |= bsdf_microfacet_ggx_aniso_fresnel_setup(bsdf);
 						else /* use multi-scatter GGX */
-							ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf);
+							sd->flag |= bsdf_microfacet_multi_ggx_aniso_fresnel_setup(bsdf);
 					}
 				}
 #ifdef __CAUSTICS_TRICKS__
@@ -326,7 +326,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 								bsdf->extra->cspec0 = cspec0;
 
 								/* setup bsdf */
-								ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_fresnel_setup(bsdf);
+								sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf);
 							}
 						}
 
@@ -350,7 +350,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 								bsdf->ior = ior;
 
 								/* setup bsdf */
-								ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+								sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 							}
 						}
 					}
@@ -371,7 +371,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 							bsdf->extra->cspec0 = cspec0;
 
 							/* setup bsdf */
-							ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf);
+							sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf);
 						}
 					}
 				}
@@ -399,7 +399,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 						bsdf->extra->clearcoat = clearcoat;
 
 						/* setup bsdf */
-						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_clearcoat_setup(bsdf);
+						sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf);
 					}
 				}
 #ifdef __CAUSTICS_TRICKS__
@@ -409,7 +409,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			break;
 		}
 		case CLOSURE_BSDF_DIFFUSE_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			OrenNayarBsdf *bsdf = (OrenNayarBsdf*)bsdf_alloc(sd, sizeof(OrenNayarBsdf), weight);
 
 			if(bsdf) {
@@ -418,31 +418,31 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				float roughness = param1;
 
 				if(roughness == 0.0f) {
-					ccl_fetch(sd, flag) |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
+					sd->flag |= bsdf_diffuse_setup((DiffuseBsdf*)bsdf);
 				}
 				else {
 					bsdf->roughness = roughness;
-					ccl_fetch(sd, flag) |= bsdf_oren_nayar_setup(bsdf);
+					sd->flag |= bsdf_oren_nayar_setup(bsdf);
 				}
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSLUCENT_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			DiffuseBsdf *bsdf = (DiffuseBsdf*)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
 
 			if(bsdf) {
 				bsdf->N = N;
-				ccl_fetch(sd, flag) |= bsdf_translucent_setup(bsdf);
+				sd->flag |= bsdf_translucent_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_TRANSPARENT_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
 			if(bsdf) {
-				ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
+				sd->flag |= bsdf_transparent_setup(bsdf);
 			}
 			break;
 		}
@@ -455,7 +455,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -467,21 +467,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFLECTION_ID)
-					ccl_fetch(sd, flag) |= bsdf_reflection_setup(bsdf);
+					sd->flag |= bsdf_reflection_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_setup(bsdf);
+					sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ID)
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_setup(bsdf);
+					sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
 				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID) {
 					kernel_assert(stack_valid(data_node.z));
 					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 					if(bsdf->extra) {
 						bsdf->extra->color = stack_load_float3(stack, data_node.z);
-						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_setup(bsdf);
+						sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
 					}
 				}
 				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_setup(bsdf);
+					sd->flag |= bsdf_ashikhmin_shirley_setup(bsdf);
 			}
 
 			break;
@@ -493,7 +493,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -501,7 +501,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->extra = NULL;
 
 				float eta = fmaxf(param2, 1e-5f);
-				eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+				eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 				/* setup bsdf */
 				if(type == CLOSURE_BSDF_REFRACTION_ID) {
@@ -509,7 +509,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->alpha_y = 0.0f;
 					bsdf->ior = eta;
 
-					ccl_fetch(sd, flag) |= bsdf_refraction_setup(bsdf);
+					sd->flag |= bsdf_refraction_setup(bsdf);
 				}
 				else {
 					bsdf->alpha_x = param1;
@@ -517,9 +517,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bsdf->ior = eta;
 
 					if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID)
-						ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
+						sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
 					else
-						ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_refraction_setup(bsdf);
+						sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
 				}
 			}
 
@@ -535,14 +535,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				break;
 			}
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 
 			/* index of refraction */
 			float eta = fmaxf(param2, 1e-5f);
-			eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+			eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 			/* fresnel */
-			float cosNO = dot(N, ccl_fetch(sd, I));
+			float cosNO = dot(N, sd->I);
 			float fresnel = fresnel_dielectric_cos(cosNO, eta);
 			float roughness = param1;
 
@@ -581,7 +581,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && !kernel_data.integrator.caustics_refractive && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 			MicrofacetExtra *extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 
@@ -593,13 +593,13 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->alpha_x = param1;
 				bsdf->alpha_y = param1;
 				float eta = fmaxf(param2, 1e-5f);
-				bsdf->ior = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+				bsdf->ior = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
 				kernel_assert(stack_valid(data_node.z));
 				bsdf->extra->color = stack_load_float3(stack, data_node.z);
 
 				/* setup bsdf */
-				ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
+				sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
 			}
 
 			break;
@@ -612,7 +612,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 			if(!kernel_data.integrator.caustics_reflective && (path_flag & PATH_RAY_DIFFUSE))
 				break;
 #endif
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			MicrofacetBsdf *bsdf = (MicrofacetBsdf*)bsdf_alloc(sd, sizeof(MicrofacetBsdf), weight);
 
 			if(bsdf) {
@@ -642,33 +642,33 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->ior = 0.0f;
 
 				if(type == CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID) {
-					ccl_fetch(sd, flag) |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
+					sd->flag |= bsdf_microfacet_beckmann_aniso_setup(bsdf);
 				}
 				else if(type == CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID) {
-					ccl_fetch(sd, flag) |= bsdf_microfacet_ggx_aniso_setup(bsdf);
+					sd->flag |= bsdf_microfacet_ggx_aniso_setup(bsdf);
 				}
 				else if(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID) {
 					kernel_assert(stack_valid(data_node.w));
 					bsdf->extra = (MicrofacetExtra*)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
 					if(bsdf->extra) {
 						bsdf->extra->color = stack_load_float3(stack, data_node.w);
-						ccl_fetch(sd, flag) |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
+						sd->flag |= bsdf_microfacet_multi_ggx_aniso_setup(bsdf);
 					}
 				}
 				else
-					ccl_fetch(sd, flag) |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
+					sd->flag |= bsdf_ashikhmin_shirley_aniso_setup(bsdf);
 			}
 			break;
 		}
 		case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			VelvetBsdf *bsdf = (VelvetBsdf*)bsdf_alloc(sd, sizeof(VelvetBsdf), weight);
 
 			if(bsdf) {
 				bsdf->N = N;
 
 				bsdf->sigma = saturate(param1);
-				ccl_fetch(sd, flag) |= bsdf_ashikhmin_velvet_setup(bsdf);
+				sd->flag |= bsdf_ashikhmin_velvet_setup(bsdf);
 			}
 			break;
 		}
@@ -678,7 +678,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				break;
 #endif
 		case CLOSURE_BSDF_DIFFUSE_TOON_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			ToonBsdf *bsdf = (ToonBsdf*)bsdf_alloc(sd, sizeof(ToonBsdf), weight);
 
 			if(bsdf) {
@@ -687,18 +687,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 				bsdf->smooth = param2;
 				
 				if(type == CLOSURE_BSDF_DIFFUSE_TOON_ID)
-					ccl_fetch(sd, flag) |= bsdf_diffuse_toon_setup(bsdf);
+					sd->flag |= bsdf_diffuse_toon_setup(bsdf);
 				else
-					ccl_fetch(sd, flag) |= bsdf_glossy_toon_setup(bsdf);
+					sd->flag |= bsdf_glossy_toon_setup(bsdf);
 			}
 			break;
 		}
 #ifdef __HAIR__
 		case CLOSURE_BSDF_HAIR_REFLECTION_ID:
 		case CLOSURE_BSDF_HAIR_TRANSMISSION_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			
-			if(ccl_fetch(sd, flag) & SD_BACKFACING && ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
+			if(sd->flag & SD_BACKFACING && sd->type & PRIMITIVE_ALL_CURVE) {
 				ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
 
 				if(bsdf) {
@@ -708,7 +708,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					 * better figure out a way to skip backfaces from rays
 					 * spawned by transmission from the front */
 					bsdf->weight = make_float3(1.0f, 1.0f, 1.0f);
-					ccl_fetch(sd, flag) |= bsdf_transparent_setup(bsdf);
+					sd->flag |= bsdf_transparent_setup(bsdf);
 				}
 			}
 			else {
@@ -722,18 +722,18 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					if(stack_valid(data_node.y)) {
 						bsdf->T = normalize(stack_load_float3(stack, data_node.y));
 					}
-					else if(!(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)) {
-						bsdf->T = normalize(ccl_fetch(sd, dPdv));
+					else if(!(sd->type & PRIMITIVE_ALL_CURVE)) {
+						bsdf->T = normalize(sd->dPdv);
 						bsdf->offset = 0.0f;
 					}
 					else
-						bsdf->T = normalize(ccl_fetch(sd, dPdu));
+						bsdf->T = normalize(sd->dPdu);
 
 					if(type == CLOSURE_BSDF_HAIR_REFLECTION_ID) {
-						ccl_fetch(sd, flag) |= bsdf_hair_reflection_setup(bsdf);
+						sd->flag |= bsdf_hair_reflection_setup(bsdf);
 					}
 					else {
-						ccl_fetch(sd, flag) |= bsdf_hair_transmission_setup(bsdf);
+						sd->flag |= bsdf_hair_transmission_setup(bsdf);
 					}
 				}
 			}
@@ -746,8 +746,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 		case CLOSURE_BSSRDF_CUBIC_ID:
 		case CLOSURE_BSSRDF_GAUSSIAN_ID:
 		case CLOSURE_BSSRDF_BURLEY_ID: {
-			float3 albedo = ccl_fetch(sd, svm_closure_weight);
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight;
+			float3 albedo = sd->svm_closure_weight;
+			float3 weight = sd->svm_closure_weight * mix_weight;
 			float sample_weight = fabsf(average(weight));
 			
 			/* disable in case of diffuse ancestor, can't see it well then and
@@ -773,7 +773,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bssrdf->albedo = albedo.x;
 					bssrdf->sharpness = sharpness;
 					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+					sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 
 				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, weight.y, 0.0f));
@@ -784,7 +784,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bssrdf->albedo = albedo.y;
 					bssrdf->sharpness = sharpness;
 					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+					sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 
 				bssrdf = bssrdf_alloc(sd, make_float3(0.0f, 0.0f, weight.z));
@@ -795,7 +795,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
 					bssrdf->albedo = albedo.z;
 					bssrdf->sharpness = sharpness;
 					bssrdf->N = N;
-					ccl_fetch(sd, flag) |= bssrdf_setup(bssrdf, (ClosureType)type);
+					sd->flag |= bssrdf_setup(bssrdf, (ClosureType)type);
 				}
 			}
 
@@ -825,21 +825,21 @@ ccl_device void svm_node_closure_volume(KernelGlobals *kg, ShaderData *sd, float
 
 	switch(type) {
 		case CLOSURE_VOLUME_ABSORPTION_ID: {
-			float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - ccl_fetch(sd, svm_closure_weight)) * mix_weight * density;
+			float3 weight = (make_float3(1.0f, 1.0f, 1.0f) - sd->svm_closure_weight) * mix_weight * density;
 			ShaderClosure *sc = closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_NONE_ID, weight);
 
 			if(sc) {
-				ccl_fetch(sd, flag) |= volume_absorption_setup(sc);
+				sd->flag |= volume_absorption_setup(sc);
 			}
 			break;
 		}
 		case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID: {
-			float3 weight = ccl_fetch(sd, svm_closure_weight) * mix_weight * density;
+			float3 weight = sd->svm_closure_weight * mix_weight * density;
 			HenyeyGreensteinVolume *volume = (HenyeyGreensteinVolume*)bsdf_alloc(sd, sizeof(HenyeyGreensteinVolume), weight);
 
 			if(volume) {
 				volume->g = param2; /* g */
-				ccl_fetch(sd, flag) |= volume_henyey_greenstein_setup(volume);
+				sd->flag |= volume_henyey_greenstein_setup(volume);
 			}
 			break;
 		}
@@ -859,12 +859,12 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_EMISSION_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_EMISSION;
+	sd->flag |= SD_EMISSION;
 }
 
 ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
@@ -877,10 +877,10 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_BACKGROUND_ID, sd->svm_closure_weight);
 }
 
 ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
@@ -893,12 +893,12 @@ ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 nod
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_HOLDOUT_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_HOLDOUT;
+	sd->flag |= SD_HOLDOUT;
 }
 
 ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack, uint4 node)
@@ -911,19 +911,19 @@ ccl_device void svm_node_closure_ambient_occlusion(ShaderData *sd, float *stack,
 		if(mix_weight == 0.0f)
 			return;
 
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight) * mix_weight);
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight * mix_weight);
 	}
 	else
-		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, ccl_fetch(sd, svm_closure_weight));
+		closure_alloc(sd, sizeof(ShaderClosure), CLOSURE_AMBIENT_OCCLUSION_ID, sd->svm_closure_weight);
 
-	ccl_fetch(sd, flag) |= SD_AO;
+	sd->flag |= SD_AO;
 }
 
 /* Closure Nodes */
 
 ccl_device_inline void svm_node_closure_store_weight(ShaderData *sd, float3 weight)
 {
-	ccl_fetch(sd, svm_closure_weight) = weight;
+	sd->svm_closure_weight = weight;
 }
 
 ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint b)
@@ -973,7 +973,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
 ccl_device void svm_node_set_normal(KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
 {
 	float3 normal = stack_load_float3(stack, in_direction);
-	ccl_fetch(sd, N) = normal;
+	sd->N = normal;
 	stack_store_float3(stack, out_normal, normal);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 890ab41aaaa..c94fa130af7 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -25,10 +25,10 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
 	uint normal_offset, distance_offset, invert, use_object_space;
 	decode_node_uchar4(node.y, &normal_offset, &distance_offset, &invert, &use_object_space);
 
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
 
-	float3 dPdx = ccl_fetch(sd, dP).dx;
-	float3 dPdy = ccl_fetch(sd, dP).dy;
+	float3 dPdx = sd->dP.dx;
+	float3 dPdy = sd->dP.dy;
 
 	if(use_object_space) {
 		object_inverse_normal_transform(kg, sd, &normal_in);
@@ -80,14 +80,14 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg, ShaderData *sd, flo
 {
 	float d = stack_load_float(stack, fac_offset);
 
-	float3 dP = ccl_fetch(sd, N);
+	float3 dP = sd->N;
 	object_inverse_normal_transform(kg, sd, &dP);
 
 	dP *= d*0.1f; /* todo: get rid of this factor */
 
 	object_dir_transform(kg, sd, &dP);
 
-	ccl_fetch(sd, P) += dP;
+	sd->P += dP;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 23c97d80cb0..3703ec55015 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -23,12 +23,12 @@ ccl_device void svm_node_fresnel(ShaderData *sd, float *stack, uint ior_offset,
 	uint normal_offset, out_offset;
 	decode_node_uchar4(node, &normal_offset, &out_offset, NULL, NULL);
 	float eta = (stack_valid(ior_offset))? stack_load_float(stack, ior_offset): __uint_as_float(ior_value);
-	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = stack_valid(normal_offset)? stack_load_float3(stack, normal_offset): sd->N;
 	
 	eta = fmaxf(eta, 1e-5f);
-	eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f/eta: eta;
+	eta = (sd->flag & SD_BACKFACING)? 1.0f/eta: eta;
 
-	float f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+	float f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
 
 	stack_store_float(stack, out_offset, f);
 }
@@ -44,18 +44,18 @@ ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
 	decode_node_uchar4(node.w, &type, &normal_offset, &out_offset, NULL);
 
 	float blend = (stack_valid(blend_offset))? stack_load_float(stack, blend_offset): __uint_as_float(blend_value);
-	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): ccl_fetch(sd, N);
+	float3 normal_in = (stack_valid(normal_offset))? stack_load_float3(stack, normal_offset): sd->N;
 
 	float f;
 
 	if(type == NODE_LAYER_WEIGHT_FRESNEL) {
 		float eta = fmaxf(1.0f - blend, 1e-5f);
-		eta = (ccl_fetch(sd, flag) & SD_BACKFACING)? eta: 1.0f/eta;
+		eta = (sd->flag & SD_BACKFACING)? eta: 1.0f/eta;
 
-		f = fresnel_dielectric_cos(dot(ccl_fetch(sd, I), normal_in), eta);
+		f = fresnel_dielectric_cos(dot(sd->I, normal_in), eta);
 	}
 	else {
-		f = fabsf(dot(ccl_fetch(sd, I), normal_in));
+		f = fabsf(dot(sd->I, normal_in));
 
 		if(blend != 0.5f) {
 			blend = clamp(blend, 0.0f, 1.0f-1e-5f);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index 7d512f7ff4d..4a09d9f6653 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -27,15 +27,15 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P); break;
-		case NODE_GEOM_N: data = ccl_fetch(sd, N); break;
+		case NODE_GEOM_P: data = sd->P; break;
+		case NODE_GEOM_N: data = sd->N; break;
 #ifdef __DPDU__
 		case NODE_GEOM_T: data = primitive_tangent(kg, sd); break;
 #endif
-		case NODE_GEOM_I: data = ccl_fetch(sd, I); break;
-		case NODE_GEOM_Ng: data = ccl_fetch(sd, Ng); break;
+		case NODE_GEOM_I: data = sd->I; break;
+		case NODE_GEOM_Ng: data = sd->Ng; break;
 #ifdef __UV__
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u), ccl_fetch(sd, v), 0.0f); break;
+		case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
 #endif
 	}
 
@@ -48,8 +48,8 @@ ccl_device void svm_node_geometry_bump_dx(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx; break;
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dx, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dx, 0.0f); break;
+		case NODE_GEOM_P: data = sd->P + sd->dP.dx; break;
+		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -65,8 +65,8 @@ ccl_device void svm_node_geometry_bump_dy(KernelGlobals *kg, ShaderData *sd, flo
 	float3 data;
 
 	switch(type) {
-		case NODE_GEOM_P: data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy; break;
-		case NODE_GEOM_uv: data = make_float3(ccl_fetch(sd, u) + ccl_fetch(sd, du).dy, ccl_fetch(sd, v) + ccl_fetch(sd, dv).dy, 0.0f); break;
+		case NODE_GEOM_P: data = sd->P + sd->dP.dy; break;
+		case NODE_GEOM_uv: data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f); break;
 		default: svm_node_geometry(kg, sd, stack, type, out_offset); return;
 	}
 
@@ -87,9 +87,9 @@ ccl_device void svm_node_object_info(KernelGlobals *kg, ShaderData *sd, float *s
 			stack_store_float3(stack, out_offset, object_location(kg, sd));
 			return;
 		}
-		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, ccl_fetch(sd, object)); break;
+		case NODE_INFO_OB_INDEX: data = object_pass_id(kg, sd->object); break;
 		case NODE_INFO_MAT_INDEX: data = shader_pass_id(kg, sd); break;
-		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, ccl_fetch(sd, object)); break;
+		case NODE_INFO_OB_RANDOM: data = object_random_number(kg, sd->object); break;
 		default: data = 0.0f; break;
 	}
 
@@ -106,44 +106,44 @@ ccl_device void svm_node_particle_info(KernelGlobals *kg,
 {
 	switch(type) {
 		case NODE_INFO_PAR_INDEX: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_index(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_AGE: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_age(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LIFETIME: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_lifetime(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_LOCATION: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_location(kg, particle_id));
 			break;
 		}
 #if 0	/* XXX float4 currently not supported in SVM stack */
 		case NODE_INFO_PAR_ROTATION: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float4(stack, out_offset, particle_rotation(kg, particle_id));
 			break;
 		}
 #endif
 		case NODE_INFO_PAR_SIZE: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float(stack, out_offset, particle_size(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_velocity(kg, particle_id));
 			break;
 		}
 		case NODE_INFO_PAR_ANGULAR_VELOCITY: {
-			int particle_id = object_particle_id(kg, ccl_fetch(sd, object));
+			int particle_id = object_particle_id(kg, sd->object);
 			stack_store_float3(stack, out_offset, particle_angular_velocity(kg, particle_id));
 			break;
 		}
@@ -165,7 +165,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_INFO_CURVE_IS_STRAND: {
-			data = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) != 0;
+			data = (sd->type & PRIMITIVE_ALL_CURVE) != 0;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}
@@ -177,7 +177,7 @@ ccl_device void svm_node_hair_info(KernelGlobals *kg,
 			break;
 		}
 		/*case NODE_INFO_CURVE_FADE: {
-			data = ccl_fetch(sd, curve_transparency);
+			data = sd->curve_transparency;
 			stack_store_float(stack, out_offset, data);
 			break;
 		}*/
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 2afdf61b476..76acc9253a1 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -144,7 +144,6 @@ ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y,
 		case 86: r = kernel_tex_image_interp(__tex_image_byte4_086, x, y); break;
 		case 87: r = kernel_tex_image_interp(__tex_image_byte4_087, x, y); break;
 		case 88: r = kernel_tex_image_interp(__tex_image_byte4_088, x, y); break;
-		case 89: r = kernel_tex_image_interp(__tex_image_byte4_089, x, y); break;
 		default:
 			kernel_assert(0);
 			return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
@@ -238,9 +237,9 @@ ccl_device void svm_node_tex_image(KernelGlobals *kg, ShaderData *sd, float *sta
 ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
 {
 	/* get object space normal */
-	float3 N = ccl_fetch(sd, N);
+	float3 N = sd->N;
 
-	N = ccl_fetch(sd, N);
+	N = sd->N;
 	object_inverse_normal_transform(kg, sd, &N);
 
 	/* project from direction vector to barycentric coordinates in triangles */
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 04f6f623f18..1492e358608 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -31,8 +31,8 @@ ccl_device void svm_node_light_path(ShaderData *sd, ccl_addr_space PathState *st
 		case NODE_LP_reflection: info = (path_flag & PATH_RAY_REFLECT)? 1.0f: 0.0f; break;
 		case NODE_LP_transmission: info = (path_flag & PATH_RAY_TRANSMIT)? 1.0f: 0.0f; break;
 		case NODE_LP_volume_scatter: info = (path_flag & PATH_RAY_VOLUME_SCATTER)? 1.0f: 0.0f; break;
-		case NODE_LP_backfacing: info = (ccl_fetch(sd, flag) & SD_BACKFACING)? 1.0f: 0.0f; break;
-		case NODE_LP_ray_length: info = ccl_fetch(sd, ray_length); break;
+		case NODE_LP_backfacing: info = (sd->flag & SD_BACKFACING)? 1.0f: 0.0f; break;
+		case NODE_LP_ray_length: info = sd->ray_length; break;
 		case NODE_LP_ray_depth: info = (float)state->bounce; break;
 		case NODE_LP_ray_diffuse: info = (float)state->diffuse_bounce; break;
 		case NODE_LP_ray_glossy: info = (float)state->glossy_bounce; break;
@@ -56,14 +56,14 @@ ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
 
 	switch(type) {
 		case NODE_LIGHT_FALLOFF_QUADRATIC: break;
-		case NODE_LIGHT_FALLOFF_LINEAR: strength *= ccl_fetch(sd, ray_length); break;
-		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length); break;
+		case NODE_LIGHT_FALLOFF_LINEAR: strength *= sd->ray_length; break;
+		case NODE_LIGHT_FALLOFF_CONSTANT: strength *= sd->ray_length*sd->ray_length; break;
 	}
 
 	float smooth = stack_load_float(stack, smooth_offset);
 
 	if(smooth > 0.0f) {
-		float squared = ccl_fetch(sd, ray_length)*ccl_fetch(sd, ray_length);
+		float squared = sd->ray_length*sd->ray_length;
 		/* Distant lamps set the ray length to FLT_MAX, which causes squared to overflow. */
 		if(isfinite(squared)) {
 			strength *= squared/(smooth + squared);
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index 01547b60014..a7f15de7325 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -134,32 +134,37 @@ ccl_device float3 svm_math_blackbody_color(float t) {
 		{  6.72595954e-13f, -2.73059993e-08f,  4.24068546e-04f, -7.52204323e-01f },
 	};
 
-	if(t >= 12000.0f)
+	int i;
+	if(t >= 12000.0f) {
 		return make_float3(0.826270103f, 0.994478524f, 1.56626022f);
+	}
+	else if(t >= 6365.0f) {
+		i = 5;
+	}
+	else if(t >= 3315.0f) {
+		i = 4;
+	}
+	else if(t >= 1902.0f) {
+		i = 3;
+	}
+	else if(t >= 1449.0f) {
+		i = 2;
+	}
+	else if(t >= 1167.0f) {
+		i = 1;
+	}
+	else if(t >= 965.0f) {
+		i = 0;
+	}
+	else {
+		/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
+		return make_float3(4.70366907f, 0.0f, 0.0f);
+	}
 
-	/* Define a macro to reduce stack usage for nvcc */
-#define MAKE_BB_RGB(i) make_float3(\
-		rc[i][0] / t + rc[i][1] * t + rc[i][2],\
-		gc[i][0] / t + gc[i][1] * t + gc[i][2],\
-		((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3])
-
-	if(t >= 6365.0f)
-		return MAKE_BB_RGB(5);
-	if(t >= 3315.0f)
-		return MAKE_BB_RGB(4);
-	if(t >= 1902.0f)
-		return MAKE_BB_RGB(3);
-	if(t >= 1449.0f)
-		return MAKE_BB_RGB(2);
-	if(t >= 1167.0f)
-		return MAKE_BB_RGB(1);
-	if(t >= 965.0f)
-		return MAKE_BB_RGB(0);
-
-#undef MAKE_BB_RGB
-
-	/* For 800 <= t < 965 color does not change in OSL implementation, so keep color the same */
-	return make_float3(4.70366907f, 0.0f, 0.0f);
+	const float t_inv = 1.0f / t;
+	return make_float3(rc[i][0] * t_inv + rc[i][1] * t + rc[i][2],
+	                   gc[i][0] * t_inv + gc[i][1] * t + gc[i][2],
+	                   ((bc[i][0] * t + bc[i][1]) * t + bc[i][2]) * t + bc[i][3]);
 }
 
 ccl_device_inline float3 svm_math_gamma_color(float3 color, float gamma)
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 62ff38cf1c5..0347ab7b193 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -18,50 +18,42 @@ CCL_NAMESPACE_BEGIN
 
 /* Noise */
 
-ccl_device_inline void svm_noise(float3 p, float detail, float distortion, float *fac, float3 *color)
-{
-	int hard = 0;
-
-	if(distortion != 0.0f) {
-		float3 r, offset = make_float3(13.5f, 13.5f, 13.5f);
-
-		r.x = noise(p + offset) * distortion;
-		r.y = noise(p) * distortion;
-		r.z = noise(p - offset) * distortion;
-
-		p += r;
-	}
-
-	*fac = noise_turbulence(p, detail, hard);
-	*color = make_float3(*fac,
-		noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
-		noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
-}
-
 ccl_device void svm_node_tex_noise(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
 {
 	uint co_offset, scale_offset, detail_offset, distortion_offset, fac_offset, color_offset;
 
 	decode_node_uchar4(node.y, &co_offset, &scale_offset, &detail_offset, &distortion_offset);
+	decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL);
 
 	uint4 node2 = read_node(kg, offset);
 
 	float scale = stack_load_float_default(stack, scale_offset, node2.x);
 	float detail = stack_load_float_default(stack, detail_offset, node2.y);
 	float distortion = stack_load_float_default(stack, distortion_offset, node2.z);
-	float3 co = stack_load_float3(stack, co_offset);
+	float3 p = stack_load_float3(stack, co_offset) * scale;
+	int hard = 0;
 
-	float3 color;
-	float f;
+	if(distortion != 0.0f) {
+		float3 r, offset = make_float3(13.5f, 13.5f, 13.5f);
+
+		r.x = noise(p + offset) * distortion;
+		r.y = noise(p) * distortion;
+		r.z = noise(p - offset) * distortion;
 
-	svm_noise(co*scale, detail, distortion, &f, &color);
+		p += r;
+	}
 
-	decode_node_uchar4(node.z, &color_offset, &fac_offset, NULL, NULL);
+	float f = noise_turbulence(p, detail, hard);
 
-	if(stack_valid(fac_offset))
+	if(stack_valid(fac_offset)) {
 		stack_store_float(stack, fac_offset, f);
-	if(stack_valid(color_offset))
+	}
+	if(stack_valid(color_offset)) {
+		float3 color = make_float3(f,
+			noise_turbulence(make_float3(p.y, p.x, p.z), detail, hard),
+			noise_turbulence(make_float3(p.y, p.z, p.x), detail, hard));
 		stack_store_float3(stack, color_offset, color);
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index c0b01262212..c94327401f5 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -31,9 +31,9 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P);
+			data = sd->P;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -48,47 +48,47 @@ ccl_device void svm_node_tex_coord(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P));
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + camera_position(kg));
+				data = transform_point(&tfm, sd->P + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P));
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P));
+				data = camera_world_to_ndc(kg, sd, sd->P);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P);
+			data = sd->P;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -112,9 +112,9 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			data = sd->P + sd->dP.dx;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -129,47 +129,47 @@ ccl_device void svm_node_tex_coord_bump_dx(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P + sd->dP.dx);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx + camera_position(kg));
+				data = transform_point(&tfm, sd->P + sd->dP.dx + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dx);
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx);
+				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dx;
+			data = sd->P + sd->dP.dx;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -196,9 +196,9 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 
 	switch(type) {
 		case NODE_TEXCO_OBJECT: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			data = sd->P + sd->dP.dy;
 			if(node.w == 0) {
-				if(ccl_fetch(sd, object) != OBJECT_NONE) {
+				if(sd->object != OBJECT_NONE) {
 					object_inverse_position_transform(kg, sd, &data);
 				}
 			}
@@ -213,47 +213,47 @@ ccl_device void svm_node_tex_coord_bump_dy(KernelGlobals *kg,
 			break;
 		}
 		case NODE_TEXCO_NORMAL: {
-			data = ccl_fetch(sd, N);
+			data = sd->N;
 			object_inverse_normal_transform(kg, sd, &data);
 			break;
 		}
 		case NODE_TEXCO_CAMERA: {
 			Transform tfm = kernel_data.cam.worldtocamera;
 
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+			if(sd->object != OBJECT_NONE)
+				data = transform_point(&tfm, sd->P + sd->dP.dy);
 			else
-				data = transform_point(&tfm, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy + camera_position(kg));
+				data = transform_point(&tfm, sd->P + sd->dP.dy + camera_position(kg));
 			break;
 		}
 		case NODE_TEXCO_WINDOW: {
-			if((path_flag & PATH_RAY_CAMERA) && ccl_fetch(sd, object) == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, ray_P) + ccl_fetch(sd, ray_dP).dy);
+			if((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE && kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
+				data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
 			else
-				data = camera_world_to_ndc(kg, sd, ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy);
+				data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
 			data.z = 0.0f;
 			break;
 		}
 		case NODE_TEXCO_REFLECTION: {
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
-				data = 2.0f*dot(ccl_fetch(sd, N), ccl_fetch(sd, I))*ccl_fetch(sd, N) - ccl_fetch(sd, I);
+			if(sd->object != OBJECT_NONE)
+				data = 2.0f*dot(sd->N, sd->I)*sd->N - sd->I;
 			else
-				data = ccl_fetch(sd, I);
+				data = sd->I;
 			break;
 		}
 		case NODE_TEXCO_DUPLI_GENERATED: {
-			data = object_dupli_generated(kg, ccl_fetch(sd, object));
+			data = object_dupli_generated(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_DUPLI_UV: {
-			data = object_dupli_uv(kg, ccl_fetch(sd, object));
+			data = object_dupli_uv(kg, sd->object);
 			break;
 		}
 		case NODE_TEXCO_VOLUME_GENERATED: {
-			data = ccl_fetch(sd, P) + ccl_fetch(sd, dP).dy;
+			data = sd->P + sd->dP.dy;
 
 #ifdef __VOLUME__
-			if(ccl_fetch(sd, object) != OBJECT_NONE)
+			if(sd->object != OBJECT_NONE)
 				data = volume_normalized_position(kg, sd, data);
 #endif
 			break;
@@ -274,12 +274,12 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 	float3 color = stack_load_float3(stack, color_offset);
 	color = 2.0f*make_float3(color.x - 0.5f, color.y - 0.5f, color.z - 0.5f);
 
-	bool is_backfacing = (ccl_fetch(sd, flag) & SD_BACKFACING) != 0;
+	bool is_backfacing = (sd->flag & SD_BACKFACING) != 0;
 	float3 N;
 
 	if(space == NODE_NORMAL_MAP_TANGENT) {
 		/* tangent space */
-		if(ccl_fetch(sd, object) == OBJECT_NONE) {
+		if(sd->object == OBJECT_NONE) {
 			stack_store_float3(stack, normal_offset, make_float3(0.0f, 0.0f, 0.0f));
 			return;
 		}
@@ -299,11 +299,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 		float sign = primitive_attribute_float(kg, sd, attr_sign, NULL, NULL);
 		float3 normal;
 
-		if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
+		if(sd->shader & SHADER_SMOOTH_NORMAL) {
 			normal = primitive_attribute_float3(kg, sd, attr_normal, NULL, NULL);
 		}
 		else {
-			normal = ccl_fetch(sd, Ng);
+			normal = sd->Ng;
 
 			/* the normal is already inverted, which is too soon for the math here */
 			if(is_backfacing) {
@@ -345,11 +345,11 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
 
 	if(strength != 1.0f) {
 		strength = max(strength, 0.0f);
-		N = safe_normalize(ccl_fetch(sd, N) + (N - ccl_fetch(sd, N))*strength);
+		N = safe_normalize(sd->N + (N - sd->N)*strength);
 	}
 
 	if(is_zero(N)) {
-		N = ccl_fetch(sd, N);
+		N = sd->N;
 	}
 
 	stack_store_float3(stack, normal_offset, N);
@@ -377,7 +377,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 		float3 generated;
 
 		if(desc.offset == ATTR_STD_NOT_FOUND)
-			generated = ccl_fetch(sd, P);
+			generated = sd->P;
 		else
 			generated = primitive_attribute_float3(kg, sd, desc, NULL, NULL);
 
@@ -390,7 +390,7 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 	}
 
 	object_normal_transform(kg, sd, &tangent);
-	tangent = cross(ccl_fetch(sd, N), normalize(cross(tangent, ccl_fetch(sd, N))));
+	tangent = cross(sd->N, normalize(cross(tangent, sd->N)));
 	stack_store_float3(stack, tangent_offset, tangent);
 }
 
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 4c32130d06d..4e92f27acdb 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -33,7 +33,7 @@ ccl_device void svm_node_vector_transform(KernelGlobals *kg, ShaderData *sd, flo
 	NodeVectorTransformConvertSpace to = (NodeVectorTransformConvertSpace)ito;
 	
 	Transform tfm;
-	bool is_object = (ccl_fetch(sd, object) != OBJECT_NONE);
+	bool is_object = (sd->object != OBJECT_NONE);
 	bool is_direction = (type == NODE_VECTOR_TRANSFORM_TYPE_VECTOR || type == NODE_VECTOR_TRANSFORM_TYPE_NORMAL);
 	
 	/* From world */
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index a8b3604a8a7..9e826c8c23f 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -46,7 +46,7 @@ ccl_device void svm_node_tex_voxel(KernelGlobals *kg,
 #  if defined(__KERNEL_CUDA__)
 #    if __CUDA_ARCH__ >= 300
 	CUtexObject tex = kernel_tex_fetch(__bindless_mapping, id);
-	if(id < 2048) /* TODO(dingto): Make this a variable */
+	if(id < TEX_START_HALF4_CUDA_KEPLER)
 		r = kernel_tex_image_interp_3d_float4(tex, co.x, co.y, co.z);
 	else {
 		float f = kernel_tex_image_interp_3d_float(tex, co.x, co.y, co.z);
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 87e40791333..3c6353c8001 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -41,9 +41,9 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
                                   float3 *P)
 {
 #ifdef __HAIR__
-	if(ccl_fetch(sd, prim) != PRIM_NONE && ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)
+	if(sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
 #else
-	if(ccl_fetch(sd, prim) != PRIM_NONE)
+	if(sd->prim != PRIM_NONE)
 #endif
 	{
 		float3 Co[3];
@@ -52,12 +52,12 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
 		/* Triangles */
 		int np = 3;
 
-		if(ccl_fetch(sd, type) & PRIMITIVE_TRIANGLE)
-			triangle_vertices(kg, ccl_fetch(sd, prim), Co);
+		if(sd->type & PRIMITIVE_TRIANGLE)
+			triangle_vertices(kg, sd->prim, Co);
 		else
-			motion_triangle_vertices(kg, ccl_fetch(sd, object), ccl_fetch(sd, prim), ccl_fetch(sd, time), Co);
+			motion_triangle_vertices(kg, sd->object, sd->prim, sd->time, Co);
 
-		if(!(ccl_fetch(sd, object_flag) & SD_OBJECT_TRANSFORM_APPLIED)) {
+		if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 			object_position_transform(kg, sd, &Co[0]);
 			object_position_transform(kg, sd, &Co[1]);
 			object_position_transform(kg, sd, &Co[2]);
@@ -66,8 +66,8 @@ ccl_device_inline float wireframe(KernelGlobals *kg,
 		if(pixel_size) {
 			// Project the derivatives of P to the viewing plane defined
 			// by I so we have a measure of how big is a pixel at this point
-			float pixelwidth_x = len(ccl_fetch(sd, dP).dx - dot(ccl_fetch(sd, dP).dx, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
-			float pixelwidth_y = len(ccl_fetch(sd, dP).dy - dot(ccl_fetch(sd, dP).dy, ccl_fetch(sd, I)) * ccl_fetch(sd, I));
+			float pixelwidth_x = len(sd->dP.dx - dot(sd->dP.dx, sd->I) * sd->I);
+			float pixelwidth_y = len(sd->dP.dy - dot(sd->dP.dy, sd->I) * sd->I);
 			// Take the average of both axis' length
 			pixelwidth = (pixelwidth_x + pixelwidth_y) * 0.5f;
 		}
@@ -113,20 +113,20 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg,
 	 * With OpenCL 2.0 it's possible to avoid this change, but for until
 	 * then we'll be living with such an exception.
 	 */
-	float3 P = ccl_fetch(sd, P);
+	float3 P = sd->P;
 	float f = wireframe(kg, sd, size, pixel_size, &P);
 #else
-	float f = wireframe(kg, sd, size, pixel_size, &ccl_fetch(sd, P));
+	float f = wireframe(kg, sd, size, pixel_size, &sd->P);
 #endif
 
 	/* TODO(sergey): Think of faster way to calculate derivatives. */
 	if(bump_offset == NODE_BUMP_OFFSET_DX) {
-		float3 Px = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dx;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(ccl_fetch(sd, dP).dx);
+		float3 Px = sd->P - sd->dP.dx;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Px)) / len(sd->dP.dx);
 	}
 	else if(bump_offset == NODE_BUMP_OFFSET_DY) {
-		float3 Py = ccl_fetch(sd, P) - ccl_fetch(sd, dP).dy;
-		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(ccl_fetch(sd, dP).dy);
+		float3 Py = sd->P - sd->dP.dy;
+		f += (f - wireframe(kg, sd, size, pixel_size, &Py)) / len(sd->dP.dy);
 	}
 
 	if(stack_valid(out_fac))
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index 8eaa9de3874..17ac66644e2 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -1,14 +1,6 @@
 
 set(INC
-	.
-	../device
-	../graph
-	../kernel
-	../kernel/svm
-	../kernel/osl
-	../bvh
-	../subd
-	../util
+	..
 	../../glew-mx
 )
 
diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp
index c0d429a583c..e157a385904 100644
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
-#include "image.h"
-#include "mesh.h"
-#include "attribute.h"
+#include "render/image.h"
+#include "render/mesh.h"
+#include "render/attribute.h"
 
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/attribute.h b/intern/cycles/render/attribute.h
index f4538c76369..a64eb6542d5 100644
--- a/intern/cycles/render/attribute.h
+++ b/intern/cycles/render/attribute.h
@@ -17,12 +17,12 @@
 #ifndef __ATTRIBUTE_H__
 #define __ATTRIBUTE_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_list.h"
-#include "util_param.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_param.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index 8d7d7b847fd..930debe1e33 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -14,17 +14,17 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "device.h"
-#include "integrator.h"
-#include "graph.h"
-#include "nodes.h"
-#include "scene.h"
-#include "shader.h"
-
-#include "util_foreach.h"
-#include "util_math.h"
-#include "util_types.h"
+#include "render/background.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/graph.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/shader.h"
+
+#include "util/util_foreach.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index 8029c6a9e80..db20b6ebf87 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -17,9 +17,9 @@
 #ifndef __BACKGROUND_H__
 #define __BACKGROUND_H__
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -30,7 +30,7 @@ class Shader;
 
 class Background : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	float ao_factor;
 	float ao_distance;
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index d9a297002c6..c0fcd517390 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "bake.h"
-#include "integrator.h"
+#include "render/bake.h"
+#include "render/integrator.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -171,9 +171,9 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
 		/* needs to be up to data for attribute access */
 		device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-		device->mem_alloc(d_input, MEM_READ_ONLY);
+		device->mem_alloc("bake_input", d_input, MEM_READ_ONLY);
 		device->mem_copy_to(d_input);
-		device->mem_alloc(d_output, MEM_READ_WRITE);
+		device->mem_alloc("bake_output", d_output, MEM_READ_WRITE);
 
 		DeviceTask task(DeviceTask::SHADER);
 		task.shader_input = d_input.device_pointer;
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index 25f5eb3c897..ceb94cfb682 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -17,11 +17,11 @@
 #ifndef __BAKE_H__
 #define __BAKE_H__
 
-#include "device.h"
-#include "scene.h"
+#include "device/device.h"
+#include "render/scene.h"
 
-#include "util_progress.h"
-#include "util_vector.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -73,7 +73,7 @@ public:
 
 	bool need_update;
 
-	int total_pixel_samples;
+	size_t total_pixel_samples;
 
 private:
 	BakeData *m_bake_data;
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index f1692712d61..fe2c2e78926 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -16,17 +16,17 @@
 
 #include <stdlib.h>
 
-#include "buffers.h"
-#include "device.h"
-
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_hash.h"
-#include "util_image.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_time.h"
-#include "util_types.h"
+#include "render/buffers.h"
+#include "device/device.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
+#include "util/util_image.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_time.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -129,13 +129,13 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
 	
 	/* allocate buffer */
 	buffer.resize(params.width*params.height*params.get_passes_size());
-	device->mem_alloc(buffer, MEM_READ_WRITE);
+	device->mem_alloc("render_buffer", buffer, MEM_READ_WRITE);
 	device->mem_zero(buffer);
 
 	/* allocate rng state */
 	rng_state.resize(params.width, params.height);
 
-	device->mem_alloc(rng_state, MEM_READ_WRITE);
+	device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
 }
 
 bool RenderBuffers::copy_from_device()
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index c9c2a21079a..5c78971678a 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -17,16 +17,16 @@
 #ifndef __BUFFERS_H__
 #define __BUFFERS_H__
 
-#include "device_memory.h"
+#include "device/device_memory.h"
 
-#include "film.h"
+#include "render/film.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_half.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index c8c51ec96d2..83ff8a10618 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -14,18 +14,18 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "tables.h"
-
-#include "device.h"
-
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_math_cdf.h"
-#include "util_vector.h"
+#include "render/camera.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/tables.h"
+
+#include "device/device.h"
+
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_math_cdf.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index 141ef9cccef..dd6b831b347 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -17,13 +17,13 @@
 #ifndef __CAMERA_H__
 #define __CAMERA_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_boundbox.h"
-#include "util_transform.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -39,7 +39,7 @@ class Scene;
 
 class Camera : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	/* Specifies an offset for the shutter's time interval. */
 	enum MotionPosition {
diff --git a/intern/cycles/render/constant_fold.cpp b/intern/cycles/render/constant_fold.cpp
index b7f25663bc3..2569d9eec27 100644
--- a/intern/cycles/render/constant_fold.cpp
+++ b/intern/cycles/render/constant_fold.cpp
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "constant_fold.h"
-#include "graph.h"
+#include "render/constant_fold.h"
+#include "render/graph.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/constant_fold.h b/intern/cycles/render/constant_fold.h
index 7962698319f..33f93b8c0ab 100644
--- a/intern/cycles/render/constant_fold.h
+++ b/intern/cycles/render/constant_fold.h
@@ -17,8 +17,8 @@
 #ifndef __CONSTANT_FOLD_H__
 #define __CONSTANT_FOLD_H__
 
-#include "util_types.h"
-#include "svm_types.h"
+#include "util/util_types.h"
+#include "kernel/svm/svm_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/curves.cpp b/intern/cycles/render/curves.cpp
index f671eb19cae..4c085b928fb 100644
--- a/intern/cycles/render/curves.cpp
+++ b/intern/cycles/render/curves.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "curves.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-
-#include "util_foreach.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_vector.h"
+#include "device/device.h"
+#include "render/curves.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "util/util_foreach.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/curves.h b/intern/cycles/render/curves.h
index e41967eebf5..8834764bd63 100644
--- a/intern/cycles/render/curves.h
+++ b/intern/cycles/render/curves.h
@@ -17,8 +17,8 @@
 #ifndef __CURVES_H__
 #define __CURVES_H__
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 923252bb375..7809f4345f1 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -14,19 +14,19 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "device.h"
-#include "film.h"
-#include "integrator.h"
-#include "mesh.h"
-#include "scene.h"
-#include "tables.h"
-
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_math.h"
-#include "util_math_cdf.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/film.h"
+#include "render/integrator.h"
+#include "render/mesh.h"
+#include "render/scene.h"
+#include "render/tables.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_math.h"
+#include "util/util_math_cdf.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 9fa51c51f52..83c941d5c57 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -17,12 +17,12 @@
 #ifndef __FILM_H__
 #define __FILM_H__
 
-#include "util_string.h"
-#include "util_vector.h"
+#include "util/util_string.h"
+#include "util/util_vector.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -53,7 +53,7 @@ public:
 
 class Film : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	float exposure;
 	array<Pass> passes;
diff --git a/intern/cycles/render/graph.cpp b/intern/cycles/render/graph.cpp
index 52c94ec2716..8e61daab49b 100644
--- a/intern/cycles/render/graph.cpp
+++ b/intern/cycles/render/graph.cpp
@@ -14,17 +14,18 @@
  * limitations under the License.
  */
 
-#include "attribute.h"
-#include "graph.h"
-#include "nodes.h"
-#include "shader.h"
-#include "constant_fold.h"
-
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_queue.h"
-#include "util_logging.h"
+#include "render/attribute.h"
+#include "render/graph.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/constant_fold.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_queue.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -195,6 +196,7 @@ bool ShaderNode::equals(const ShaderNode& other)
 ShaderGraph::ShaderGraph()
 {
 	finalized = false;
+	simplified = false;
 	num_node_ids = 0;
 	add(new OutputNode());
 }
@@ -207,6 +209,8 @@ ShaderGraph::~ShaderGraph()
 ShaderNode *ShaderGraph::add(ShaderNode *node)
 {
 	assert(!finalized);
+	simplified = false;
+
 	node->id = num_node_ids++;
 	nodes.push_back(node);
 	return node;
@@ -241,6 +245,7 @@ void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to)
 {
 	assert(!finalized);
 	assert(from && to);
+	simplified = false;
 
 	if(to->link) {
 		fprintf(stderr, "Cycles shader graph connect: input already connected.\n");
@@ -273,6 +278,7 @@ void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to)
 void ShaderGraph::disconnect(ShaderOutput *from)
 {
 	assert(!finalized);
+	simplified = false;
 
 	foreach(ShaderInput *sock, from->links) {
 		sock->link = NULL;
@@ -285,6 +291,7 @@ void ShaderGraph::disconnect(ShaderInput *to)
 {
 	assert(!finalized);
 	assert(to->link);
+	simplified = false;
 
 	ShaderOutput *from = to->link;
 
@@ -294,6 +301,8 @@ void ShaderGraph::disconnect(ShaderInput *to)
 
 void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to)
 {
+	simplified = false;
+
 	/* Copy because disconnect modifies this list */
 	vector<ShaderInput*> outputs = from->links;
 
@@ -310,9 +319,19 @@ void ShaderGraph::relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to)
 	}
 }
 
+void ShaderGraph::simplify(Scene *scene)
+{
+	if(!simplified) {
+		default_inputs(scene->shader_manager->use_osl());
+		clean(scene);
+		refine_bump_nodes();
+
+		simplified = true;
+	}
+}
+
 void ShaderGraph::finalize(Scene *scene,
                            bool do_bump,
-                           bool do_osl,
                            bool do_simplify,
                            bool bump_in_object_space)
 {
@@ -322,9 +341,7 @@ void ShaderGraph::finalize(Scene *scene,
 	 * modified afterwards. */
 
 	if(!finalized) {
-		default_inputs(do_osl);
-		clean(scene);
-		refine_bump_nodes();
+		simplify(scene);
 
 		if(do_bump)
 			bump_from_displacement(bump_in_object_space);
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 780fdf49ca4..09932695d1f 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -17,17 +17,17 @@
 #ifndef __GRAPH_H__
 #define __GRAPH_H__
 
-#include "node.h"
-#include "node_type.h"
+#include "graph/node.h"
+#include "graph/node_type.h"
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "util_list.h"
-#include "util_map.h"
-#include "util_param.h"
-#include "util_set.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_set.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -201,14 +201,14 @@ public:
 /* Node definition utility macros */
 
 #define SHADER_NODE_CLASS(type) \
-	NODE_DECLARE; \
+	NODE_DECLARE \
 	type(); \
 	virtual ShaderNode *clone() const { return new type(*this); } \
 	virtual void compile(SVMCompiler& compiler); \
 	virtual void compile(OSLCompiler& compiler); \
 
 #define SHADER_NODE_NO_CLONE_CLASS(type) \
-	NODE_DECLARE; \
+	NODE_DECLARE \
 	type(); \
 	virtual void compile(SVMCompiler& compiler); \
 	virtual void compile(OSLCompiler& compiler); \
@@ -240,6 +240,7 @@ public:
 	list<ShaderNode*> nodes;
 	size_t num_node_ids;
 	bool finalized;
+	bool simplified;
 
 	ShaderGraph();
 	~ShaderGraph();
@@ -255,9 +256,9 @@ public:
 	void relink(ShaderNode *node, ShaderOutput *from, ShaderOutput *to);
 
 	void remove_proxy_nodes();
+	void simplify(Scene *scene);
 	void finalize(Scene *scene,
 	              bool do_bump = false,
-	              bool do_osl = false,
 	              bool do_simplify = false,
 	              bool bump_in_object_space = false);
 
diff --git a/intern/cycles/render/image.cpp b/intern/cycles/render/image.cpp
index fd8a1262208..a8c4f446bea 100644
--- a/intern/cycles/render/image.cpp
+++ b/intern/cycles/render/image.cpp
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "image.h"
-#include "scene.h"
+#include "device/device.h"
+#include "render/image.h"
+#include "render/scene.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_path.h"
-#include "util_progress.h"
-#include "util_texture.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
+#include "util/util_texture.h"
 
 #ifdef WITH_OSL
 #include <OSL/oslexec.h>
@@ -156,6 +156,16 @@ ImageManager::ImageDataType ImageManager::get_image_metadata(const string& filen
 		}
 	}
 
+	/* Perform preliminary checks, with meaningful logging. */
+	if(!path_exists(filename)) {
+		VLOG(1) << "File '" << filename << "' does not exist.";
+		return IMAGE_DATA_TYPE_BYTE4;
+	}
+	if(path_is_directory(filename)) {
+		VLOG(1) << "File '" << filename << "' is a directory, can't use as image.";
+		return IMAGE_DATA_TYPE_BYTE4;
+	}
+
 	ImageInput *in = ImageInput::create(filename);
 
 	if(in) {
@@ -285,9 +295,8 @@ int ImageManager::add_image(const string& filename,
 
 	thread_scoped_lock device_lock(device_mutex);
 
-	/* Do we have a float? */
-	if(type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4)
-		is_float = true;
+	/* Check whether it's a float texture. */
+	is_float = (type == IMAGE_DATA_TYPE_FLOAT || type == IMAGE_DATA_TYPE_FLOAT4);
 
 	/* No single channel and half textures on CUDA (Fermi) and no half on OpenCL, use available slots */
 	if((type == IMAGE_DATA_TYPE_FLOAT ||
@@ -433,6 +442,11 @@ bool ImageManager::file_load_image_generic(Image *img, ImageInput **in, int &wid
 		return false;
 
 	if(!img->builtin_data) {
+		/* NOTE: Error logging is done in meta data acquisition. */
+		if(!path_exists(img->filename) || path_is_directory(img->filename)) {
+			return false;
+		}
+
 		/* load image from file through OIIO */
 		*in = ImageInput::create(img->filename);
 
diff --git a/intern/cycles/render/image.h b/intern/cycles/render/image.h
index 494c74f0cdd..996b5a5b65f 100644
--- a/intern/cycles/render/image.h
+++ b/intern/cycles/render/image.h
@@ -17,13 +17,13 @@
 #ifndef __IMAGE_H__
 #define __IMAGE_H__
 
-#include "device.h"
-#include "device_memory.h"
+#include "device/device.h"
+#include "device/device_memory.h"
 
-#include "util_image.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_image.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index 1ab0f9874f2..a004bb5b856 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "scene.h"
-#include "shader.h"
-#include "sobol.h"
-
-#include "util_foreach.h"
-#include "util_hash.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/sobol.h"
+
+#include "util/util_foreach.h"
+#include "util/util_hash.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 27fff4831e5..9501d7f8416 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -17,9 +17,9 @@
 #ifndef __INTEGRATOR_H__
 #define __INTEGRATOR_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -29,7 +29,7 @@ class Scene;
 
 class Integrator : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	int min_bounce;
 	int max_bounce;
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 2245c861d5a..4886dcd563f 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -14,19 +14,19 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "device.h"
-#include "integrator.h"
-#include "film.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
-
-#include "util_foreach.h"
-#include "util_progress.h"
-#include "util_logging.h"
+#include "render/background.h"
+#include "device/device.h"
+#include "render/integrator.h"
+#include "render/film.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
+
+#include "util/util_foreach.h"
+#include "util/util_progress.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -57,9 +57,9 @@ static void shade_background_pixels(Device *device, DeviceScene *dscene, int res
 
 	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-	device->mem_alloc(d_input, MEM_READ_ONLY);
+	device->mem_alloc("shade_background_pixels_input", d_input, MEM_READ_ONLY);
 	device->mem_copy_to(d_input);
-	device->mem_alloc(d_output, MEM_WRITE_ONLY);
+	device->mem_alloc("shade_background_pixels_output", d_output, MEM_WRITE_ONLY);
 
 	DeviceTask main_task(DeviceTask::SHADER);
 	main_task.shader_input = d_input.device_pointer;
@@ -486,10 +486,18 @@ static void background_cdf(int start,
                            float2 *cond_cdf)
 {
 	/* Conditional CDFs (rows, U direction). */
+	/* NOTE: It is possible to have some NaN pixels on background
+	 * which will ruin CDF causing wrong shading. We replace such
+	 * pixels with black.
+	 */
 	for(int i = start; i < end; i++) {
 		float sin_theta = sinf(M_PI_F * (i + 0.5f) / res);
 		float3 env_color = (*pixels)[i * res];
 		float ave_luminance = average(env_color);
+		/* TODO(sergey): Consider adding average_safe(). */
+		if(!isfinite(ave_luminance)) {
+			ave_luminance = 0.0f;
+		}
 
 		cond_cdf[i * cdf_count].x = ave_luminance * sin_theta;
 		cond_cdf[i * cdf_count].y = 0.0f;
@@ -497,6 +505,9 @@ static void background_cdf(int start,
 		for(int j = 1; j < res; j++) {
 			env_color = (*pixels)[i * res + j];
 			ave_luminance = average(env_color);
+			if(!isfinite(ave_luminance)) {
+				ave_luminance = 0.0f;
+			}
 
 			cond_cdf[i * cdf_count + j].x = ave_luminance * sin_theta;
 			cond_cdf[i * cdf_count + j].y = cond_cdf[i * cdf_count + j - 1].y + cond_cdf[i * cdf_count + j - 1].x / res;
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index f56530b6490..7e9014eb823 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -17,12 +17,12 @@
 #ifndef __LIGHT_H__
 #define __LIGHT_H__
 
-#include "kernel_types.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/mesh.cpp b/intern/cycles/render/mesh.cpp
index c42b32919d4..a4dc06c4345 100644
--- a/intern/cycles/render/mesh.cpp
+++ b/intern/cycles/render/mesh.cpp
@@ -14,29 +14,29 @@
  * limitations under the License.
  */
 
-#include "bvh.h"
-#include "bvh_build.h"
-
-#include "camera.h"
-#include "curves.h"
-#include "device.h"
-#include "graph.h"
-#include "shader.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "scene.h"
-
-#include "osl_globals.h"
-
-#include "subd_split.h"
-#include "subd_patch_table.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_progress.h"
-#include "util_set.h"
+#include "bvh/bvh.h"
+#include "bvh/bvh_build.h"
+
+#include "render/camera.h"
+#include "render/curves.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/shader.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/scene.h"
+
+#include "kernel/osl/osl_globals.h"
+
+#include "subd/subd_split.h"
+#include "subd/subd_patch_table.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_set.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -1873,9 +1873,14 @@ void MeshManager::device_update_bvh(Device *device, DeviceScene *dscene, Scene *
 		dscene->prim_object.reference((uint*)&pack.prim_object[0], pack.prim_object.size());
 		device->tex_alloc("__prim_object", dscene->prim_object);
 	}
+	if(pack.prim_time.size()) {
+		dscene->prim_time.reference((float2*)&pack.prim_time[0], pack.prim_time.size());
+		device->tex_alloc("__prim_time", dscene->prim_time);
+	}
 
 	dscene->data.bvh.root = pack.root_index;
 	dscene->data.bvh.use_qbvh = scene->params.use_qbvh;
+	dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
 }
 
 void MeshManager::device_update_flags(Device * /*device*/,
@@ -2152,6 +2157,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 	device->tex_free(dscene->prim_visibility);
 	device->tex_free(dscene->prim_index);
 	device->tex_free(dscene->prim_object);
+	device->tex_free(dscene->prim_time);
 	device->tex_free(dscene->tri_shader);
 	device->tex_free(dscene->tri_vnormal);
 	device->tex_free(dscene->tri_vindex);
@@ -2173,6 +2179,7 @@ void MeshManager::device_free(Device *device, DeviceScene *dscene)
 	dscene->prim_visibility.clear();
 	dscene->prim_index.clear();
 	dscene->prim_object.clear();
+	dscene->prim_time.clear();
 	dscene->tri_shader.clear();
 	dscene->tri_vnormal.clear();
 	dscene->tri_vindex.clear();
diff --git a/intern/cycles/render/mesh.h b/intern/cycles/render/mesh.h
index 5f33e30eac2..043ce9d0ffc 100644
--- a/intern/cycles/render/mesh.h
+++ b/intern/cycles/render/mesh.h
@@ -17,17 +17,18 @@
 #ifndef __MESH_H__
 #define __MESH_H__
 
-#include "attribute.h"
-#include "node.h"
-#include "shader.h"
-
-#include "util_boundbox.h"
-#include "util_list.h"
-#include "util_map.h"
-#include "util_param.h"
-#include "util_transform.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "graph/node.h"
+
+#include "render/attribute.h"
+#include "render/shader.h"
+
+#include "util/util_boundbox.h"
+#include "util/util_list.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -48,7 +49,7 @@ struct PackedPatchTable;
 
 class Mesh : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	/* Mesh Triangle */
 	struct Triangle {
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index adc5b820298..cf28bb16bb7 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
-#include "device.h"
+#include "device/device.h"
 
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "shader.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/shader.h"
 
-#include "util_foreach.h"
-#include "util_progress.h"
+#include "util/util_foreach.h"
+#include "util/util_progress.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -121,9 +121,9 @@ bool MeshManager::displace(Device *device, DeviceScene *dscene, Scene *scene, Me
 	/* needs to be up to data for attribute access */
 	device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
 
-	device->mem_alloc(d_input, MEM_READ_ONLY);
+	device->mem_alloc("displace_input", d_input, MEM_READ_ONLY);
 	device->mem_copy_to(d_input);
-	device->mem_alloc(d_output, MEM_WRITE_ONLY);
+	device->mem_alloc("displace_output", d_output, MEM_WRITE_ONLY);
 
 	DeviceTask task(DeviceTask::SHADER);
 	task.shader_input = d_input.device_pointer;
diff --git a/intern/cycles/render/mesh_subdivision.cpp b/intern/cycles/render/mesh_subdivision.cpp
index 57c76a9f1c8..585ed77b026 100644
--- a/intern/cycles/render/mesh_subdivision.cpp
+++ b/intern/cycles/render/mesh_subdivision.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "mesh.h"
-#include "attribute.h"
-#include "camera.h"
+#include "render/mesh.h"
+#include "render/attribute.h"
+#include "render/camera.h"
 
-#include "subd_split.h"
-#include "subd_patch.h"
-#include "subd_patch_table.h"
+#include "subd/subd_split.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_patch_table.h"
 
-#include "util_foreach.h"
-#include "util_algorithm.h"
+#include "util/util_foreach.h"
+#include "util/util_algorithm.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index c02c1adb989..3f56690d0c1 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -14,20 +14,21 @@
  * limitations under the License.
  */
 
-#include "image.h"
-#include "integrator.h"
-#include "nodes.h"
-#include "scene.h"
-#include "svm.h"
-#include "svm_color_util.h"
-#include "svm_ramp_util.h"
-#include "svm_math_util.h"
-#include "osl.h"
-#include "constant_fold.h"
-
-#include "util_sky_model.h"
-#include "util_foreach.h"
-#include "util_transform.h"
+#include "render/image.h"
+#include "render/integrator.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/svm.h"
+#include "kernel/svm/svm_color_util.h"
+#include "kernel/svm/svm_ramp_util.h"
+#include "kernel/svm/svm_math_util.h"
+#include "render/osl.h"
+#include "render/constant_fold.h"
+
+#include "util/util_sky_model.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_transform.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -1931,21 +1932,38 @@ GlossyBsdfNode::GlossyBsdfNode()
 void GlossyBsdfNode::simplify_settings(Scene *scene)
 {
 	if(distribution_orig == NBUILTIN_CLOSURES) {
+		roughness_orig = roughness;
 		distribution_orig = distribution;
 	}
+	else {
+		/* By default we use original values, so we don't worry about restoring
+		 * defaults later on and can only do override when needed.
+		 */
+		roughness = roughness_orig;
+		distribution = distribution_orig;
+	}
 	Integrator *integrator = scene->integrator;
+	ShaderInput *roughness_input = input("Roughness");
 	if(integrator->filter_glossy == 0.0f) {
 		/* Fallback to Sharp closure for Roughness close to 0.
 		 * Note: Keep the epsilon in sync with kernel!
 		 */
-		ShaderInput *roughness_input = input("Roughness");
 		if(!roughness_input->link && roughness <= 1e-4f) {
+			VLOG(1) << "Using sharp glossy BSDF.";
 			distribution = CLOSURE_BSDF_REFLECTION_ID;
 		}
 	}
 	else {
-		/* Rollback to original distribution when filter glossy is used. */
-		distribution = distribution_orig;
+		/* If filter glossy is used we replace Sharp glossy with GGX so we can
+		 * benefit from closure blur to remove unwanted noise.
+		 */
+		if(roughness_input->link == NULL &&
+		   distribution == CLOSURE_BSDF_REFLECTION_ID)
+		{
+			VLOG(1) << "Using GGX glossy with filter glossy.";
+			distribution = CLOSURE_BSDF_MICROFACET_GGX_ID;
+			roughness = 0.0f;
+		}
 	}
 	closure = distribution;
 }
@@ -1953,7 +1971,8 @@ void GlossyBsdfNode::simplify_settings(Scene *scene)
 bool GlossyBsdfNode::has_integrator_dependency()
 {
 	ShaderInput *roughness_input = input("Roughness");
-	return !roughness_input->link && roughness <= 1e-4f;
+	return !roughness_input->link &&
+	       (distribution == CLOSURE_BSDF_REFLECTION_ID || roughness <= 1e-4f);
 }
 
 void GlossyBsdfNode::compile(SVMCompiler& compiler)
@@ -2008,21 +2027,38 @@ GlassBsdfNode::GlassBsdfNode()
 void GlassBsdfNode::simplify_settings(Scene *scene)
 {
 	if(distribution_orig == NBUILTIN_CLOSURES) {
+		roughness_orig = roughness;
 		distribution_orig = distribution;
 	}
+	else {
+		/* By default we use original values, so we don't worry about restoring
+		 * defaults later on and can only do override when needed.
+		 */
+		roughness = roughness_orig;
+		distribution = distribution_orig;
+	}
 	Integrator *integrator = scene->integrator;
+	ShaderInput *roughness_input = input("Roughness");
 	if(integrator->filter_glossy == 0.0f) {
 		/* Fallback to Sharp closure for Roughness close to 0.
 		 * Note: Keep the epsilon in sync with kernel!
 		 */
-		ShaderInput *roughness_input = input("Roughness");
 		if(!roughness_input->link && roughness <= 1e-4f) {
+			VLOG(1) << "Using sharp glass BSDF.";
 			distribution = CLOSURE_BSDF_SHARP_GLASS_ID;
 		}
 	}
 	else {
-		/* Rollback to original distribution when filter glossy is used. */
-		distribution = distribution_orig;
+		/* If filter glossy is used we replace Sharp glass with GGX glass so we can
+		 * benefit from closure blur to remove unwanted noise.
+		 */
+		if(roughness_input->link == NULL &&
+		   distribution == CLOSURE_BSDF_SHARP_GLASS_ID)
+		{
+			VLOG(1) << "Using GGX glass with filter glossy.";
+			distribution = CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID;
+			roughness = 0.0f;
+		}
 	}
 	closure = distribution;
 }
@@ -2030,7 +2066,8 @@ void GlassBsdfNode::simplify_settings(Scene *scene)
 bool GlassBsdfNode::has_integrator_dependency()
 {
 	ShaderInput *roughness_input = input("Roughness");
-	return !roughness_input->link && roughness <= 1e-4f;
+	return !roughness_input->link &&
+	       (distribution == CLOSURE_BSDF_SHARP_GLASS_ID || roughness <= 1e-4f);
 }
 
 void GlassBsdfNode::compile(SVMCompiler& compiler)
@@ -2085,21 +2122,38 @@ RefractionBsdfNode::RefractionBsdfNode()
 void RefractionBsdfNode::simplify_settings(Scene *scene)
 {
 	if(distribution_orig == NBUILTIN_CLOSURES) {
+		roughness_orig = roughness;
 		distribution_orig = distribution;
 	}
+	else {
+		/* By default we use original values, so we don't worry about restoring
+		 * defaults later on and only apply overrides when needed.
+		 */
+		roughness = roughness_orig;
+		distribution = distribution_orig;
+	}
 	Integrator *integrator = scene->integrator;
+	ShaderInput *roughness_input = input("Roughness");
 	if(integrator->filter_glossy == 0.0f) {
 		/* Fallback to Sharp closure for Roughness close to 0.
 		 * Note: Keep the epsilon in sync with kernel!
 		 */
-		ShaderInput *roughness_input = input("Roughness");
 		if(!roughness_input->link && roughness <= 1e-4f) {
+			VLOG(1) << "Using sharp refraction BSDF.";
 			distribution = CLOSURE_BSDF_REFRACTION_ID;
 		}
 	}
 	else {
-		/* Rollback to original distribution when filter glossy is used. */
-		distribution = distribution_orig;
+		/* If filter glossy is used we replace Sharp refraction with GGX so we can
+		 * benefit from closure blur to remove unwanted noise.
+		 */
+		if(roughness_input->link == NULL &&
+		   distribution == CLOSURE_BSDF_REFRACTION_ID)
+		{
+			VLOG(1) << "Using GGX refraction with filter glossy.";
+			distribution = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
+			roughness = 0.0f;
+		}
 	}
 	closure = distribution;
 }
@@ -2107,7 +2161,8 @@ void RefractionBsdfNode::simplify_settings(Scene *scene)
 bool RefractionBsdfNode::has_integrator_dependency()
 {
 	ShaderInput *roughness_input = input("Roughness");
-	return !roughness_input->link && roughness <= 1e-4f;
+	return !roughness_input->link &&
+	       (distribution == CLOSURE_BSDF_REFRACTION_ID || roughness <= 1e-4f);
 }
 
 void RefractionBsdfNode::compile(SVMCompiler& compiler)
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 8d2df673688..d8023747860 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -17,10 +17,10 @@
 #ifndef __NODES_H__
 #define __NODES_H__
 
-#include "graph.h"
-#include "node.h"
+#include "render/graph.h"
+#include "graph/node.h"
 
-#include "util_string.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -324,7 +324,7 @@ private:
 class BsdfNode : public ShaderNode {
 public:
 	explicit BsdfNode(const NodeType *node_type);
-	SHADER_NODE_BASE_CLASS(BsdfNode);
+	SHADER_NODE_BASE_CLASS(BsdfNode)
 
 	bool has_spatial_varying() { return true; }
 	void compile(SVMCompiler& compiler, ShaderInput *param1, ShaderInput *param2, ShaderInput *param3 = NULL, ShaderInput *param4 = NULL);
@@ -421,7 +421,7 @@ public:
 	bool has_integrator_dependency();
 	ClosureType get_closure_type() { return distribution; }
 
-	float roughness;
+	float roughness, roughness_orig;
 	ClosureType distribution, distribution_orig;
 };
 
@@ -433,7 +433,7 @@ public:
 	bool has_integrator_dependency();
 	ClosureType get_closure_type() { return distribution; }
 
-	float roughness, IOR;
+	float roughness, roughness_orig, IOR;
 	ClosureType distribution, distribution_orig;
 };
 
@@ -445,7 +445,7 @@ public:
 	bool has_integrator_dependency();
 	ClosureType get_closure_type() { return distribution; }
 
-	float roughness, IOR;
+	float roughness, roughness_orig, IOR;
 	ClosureType distribution, distribution_orig;
 };
 
@@ -674,7 +674,7 @@ public:
 
 class MixClosureWeightNode : public ShaderNode {
 public:
-	SHADER_NODE_CLASS(MixClosureWeightNode);
+	SHADER_NODE_CLASS(MixClosureWeightNode)
 
 	float weight;
 	float fac;
@@ -920,7 +920,7 @@ public:
 class CurvesNode : public ShaderNode {
 public:
 	explicit CurvesNode(const NodeType *node_type);
-	SHADER_NODE_BASE_CLASS(CurvesNode);
+	SHADER_NODE_BASE_CLASS(CurvesNode)
 
 	virtual int get_group() { return NODE_GROUP_LEVEL_3; }
 
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index 8342f376836..375abfeb27a 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -14,22 +14,22 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "device.h"
-#include "light.h"
-#include "mesh.h"
-#include "curves.h"
-#include "object.h"
-#include "particles.h"
-#include "scene.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_vector.h"
-
-#include "subd_patch_table.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/curves.h"
+#include "render/object.h"
+#include "render/particles.h"
+#include "render/scene.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
+
+#include "subd/subd_patch_table.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -49,6 +49,8 @@ NODE_DEFINE(Object)
 	SOCKET_POINT(dupli_generated, "Dupli Generated", make_float3(0.0f, 0.0f, 0.0f));
 	SOCKET_POINT2(dupli_uv, "Dupli UV", make_float2(0.0f, 0.0f));
 
+	SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", false);
+
 	return type;
 }
 
@@ -597,6 +599,12 @@ void ObjectManager::device_update_flags(Device *device,
 		else {
 			object_flag[object_index] &= ~SD_OBJECT_HAS_VOLUME;
 		}
+		if(object->is_shadow_catcher) {
+			object_flag[object_index] |= SD_OBJECT_SHADOW_CATCHER;
+		}
+		else {
+			object_flag[object_index] &= ~SD_OBJECT_SHADOW_CATCHER;
+		}
 
 		if(bounds_valid) {
 			foreach(Object *volume_object, volume_objects) {
diff --git a/intern/cycles/render/object.h b/intern/cycles/render/object.h
index 7e306fab2a8..12d7b2c81cf 100644
--- a/intern/cycles/render/object.h
+++ b/intern/cycles/render/object.h
@@ -17,14 +17,14 @@
 #ifndef __OBJECT_H__
 #define __OBJECT_H__
 
-#include "node.h"
-#include "scene.h"
+#include "graph/node.h"
+#include "render/scene.h"
 
-#include "util_boundbox.h"
-#include "util_param.h"
-#include "util_transform.h"
-#include "util_thread.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_param.h"
+#include "util/util_transform.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -40,7 +40,7 @@ struct Transform;
 
 class Object : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	Mesh *mesh;
 	Transform tfm;
@@ -53,6 +53,7 @@ public:
 	bool use_motion;
 	bool hide_on_missing_motion;
 	bool use_holdout;
+	bool is_shadow_catcher;
 
 	float3 dupli_generated;
 	float2 dupli_uv;
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 67b68e63cb2..6bff29d1c76 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -14,26 +14,26 @@
  * limitations under the License.
  */
 
-#include "device.h"
+#include "device/device.h"
 
-#include "graph.h"
-#include "light.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
-#include "nodes.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/nodes.h"
 
 #ifdef WITH_OSL
 
-#include "osl_globals.h"
-#include "osl_services.h"
-#include "osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+#include "kernel/osl/osl_services.h"
+#include "kernel/osl/osl_shader.h"
 
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_progress.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_progress.h"
 
 #endif
 
@@ -1096,12 +1096,10 @@ void OSLCompiler::compile(Scene *scene, OSLGlobals *og, Shader *shader)
 		/* finalize */
 		shader->graph->finalize(scene,
 		                        false,
-		                        true,
 		                        shader->has_integrator_dependency);
 		if(shader->graph_bump) {
 			shader->graph_bump->finalize(scene,
 			                             true,
-			                             true,
 			                             shader->has_integrator_dependency,
 			                             shader->displacement_method == DISPLACE_BOTH);
 		}
diff --git a/intern/cycles/render/osl.h b/intern/cycles/render/osl.h
index b131b672b8c..2be1126fdd3 100644
--- a/intern/cycles/render/osl.h
+++ b/intern/cycles/render/osl.h
@@ -17,13 +17,13 @@
 #ifndef __OSL_H__
 #define __OSL_H__
 
-#include "util_set.h"
-#include "util_string.h"
-#include "util_thread.h"
+#include "util/util_set.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
 
-#include "graph.h"
-#include "nodes.h"
-#include "shader.h"
+#include "render/graph.h"
+#include "render/nodes.h"
+#include "render/shader.h"
 
 #ifdef WITH_OSL
 #include <OSL/oslcomp.h>
diff --git a/intern/cycles/render/particles.cpp b/intern/cycles/render/particles.cpp
index 1a35d60fb4b..a51822a08be 100644
--- a/intern/cycles/render/particles.cpp
+++ b/intern/cycles/render/particles.cpp
@@ -14,15 +14,15 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "particles.h"
-#include "scene.h"
-
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_map.h"
-#include "util_progress.h"
-#include "util_vector.h"
+#include "device/device.h"
+#include "render/particles.h"
+#include "render/scene.h"
+
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_progress.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/particles.h b/intern/cycles/render/particles.h
index 2509e27b44b..66d46114b3e 100644
--- a/intern/cycles/render/particles.h
+++ b/intern/cycles/render/particles.h
@@ -17,8 +17,8 @@
 #ifndef __PARTICLES_H__
 #define __PARTICLES_H__
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index 68124e78cb5..4db20338744 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -16,27 +16,27 @@
 
 #include <stdlib.h>
 
-#include "background.h"
-#include "bake.h"
-#include "camera.h"
-#include "curves.h"
-#include "device.h"
-#include "film.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "object.h"
-#include "osl.h"
-#include "particles.h"
-#include "scene.h"
-#include "shader.h"
-#include "svm.h"
-#include "tables.h"
-
-#include "util_foreach.h"
-#include "util_guarded_allocator.h"
-#include "util_logging.h"
-#include "util_progress.h"
+#include "render/background.h"
+#include "render/bake.h"
+#include "render/camera.h"
+#include "render/curves.h"
+#include "device/device.h"
+#include "render/film.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/particles.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/svm.h"
+#include "render/tables.h"
+
+#include "util/util_foreach.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 8768682043f..2b5267642a2 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -17,18 +17,18 @@
 #ifndef __SCENE_H__
 #define __SCENE_H__
 
-#include "image.h"
-#include "shader.h"
+#include "render/image.h"
+#include "render/shader.h"
 
-#include "device_memory.h"
+#include "device/device_memory.h"
 
-#include "util_param.h"
-#include "util_string.h"
-#include "util_system.h"
-#include "util_texture.h"
-#include "util_thread.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
+#include "util/util_texture.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -69,6 +69,7 @@ public:
 	device_vector<uint> prim_visibility;
 	device_vector<uint> prim_index;
 	device_vector<uint> prim_object;
+	device_vector<float2> prim_time;
 
 	/* mesh */
 	device_vector<uint> tri_shader;
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 7c01934cfd8..c9b5547b407 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -17,24 +17,24 @@
 #include <string.h>
 #include <limits.h>
 
-#include "buffers.h"
-#include "camera.h"
-#include "device.h"
-#include "graph.h"
-#include "integrator.h"
-#include "mesh.h"
-#include "object.h"
-#include "scene.h"
-#include "session.h"
-#include "bake.h"
-
-#include "util_foreach.h"
-#include "util_function.h"
-#include "util_logging.h"
-#include "util_math.h"
-#include "util_opengl.h"
-#include "util_task.h"
-#include "util_time.h"
+#include "render/buffers.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/mesh.h"
+#include "render/object.h"
+#include "render/scene.h"
+#include "render/session.h"
+#include "render/bake.h"
+
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_opengl.h"
+#include "util/util_task.h"
+#include "util/util_time.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -230,7 +230,9 @@ void Session::run_gpu()
 				while(1) {
 					scoped_timer pause_timer;
 					pause_cond.wait(pause_lock);
-					progress.add_skip_time(pause_timer, params.background);
+					if(pause) {
+						progress.add_skip_time(pause_timer, params.background);
+					}
 
 					update_status_time(pause, no_tiles);
 					progress.set_update();
@@ -520,7 +522,9 @@ void Session::run_cpu()
 				while(1) {
 					scoped_timer pause_timer;
 					pause_cond.wait(pause_lock);
-					progress.add_skip_time(pause_timer, params.background);
+					if(pause) {
+						progress.add_skip_time(pause_timer, params.background);
+					}
 
 					update_status_time(pause, no_tiles);
 					progress.set_update();
@@ -633,6 +637,9 @@ DeviceRequestedFeatures Session::get_requested_device_features()
 			requested_features.use_patch_evaluation = true;
 		}
 #endif
+		if(object->is_shadow_catcher) {
+			requested_features.use_shadow_tricks = true;
+		}
 	}
 
 	BakeManager *bake_manager = scene->bake_manager;
@@ -650,6 +657,8 @@ void Session::load_kernels()
 	if(!kernels_loaded) {
 		progress.set_status("Loading render kernels (may take a few minutes the first time)");
 
+		scoped_timer timer;
+
 		DeviceRequestedFeatures requested_features = get_requested_device_features();
 		VLOG(2) << "Requested features:\n" << requested_features;
 		if(!device->load_kernels(requested_features)) {
@@ -663,6 +672,9 @@ void Session::load_kernels()
 			return;
 		}
 
+		progress.add_skip_time(timer, false);
+		VLOG(1) << "Total time spent loading kernels: " << time_dt() - timer.get_start();
+
 		kernels_loaded = true;
 	}
 }
@@ -824,7 +836,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 	int progressive_sample = tile_manager.state.sample;
 	int num_samples = tile_manager.get_num_effective_samples();
 
-	int tile = tile_manager.state.num_rendered_tiles;
+	int tile = progress.get_finished_tiles();
 	int num_tiles = tile_manager.state.num_tiles;
 
 	/* update status */
@@ -832,7 +844,7 @@ void Session::update_status_time(bool show_pause, bool show_done)
 
 	if(!params.progressive) {
 		const bool is_cpu = params.device.type == DEVICE_CPU;
-		const bool is_last_tile = (progress.get_finished_tiles() + 1) == num_tiles;
+		const bool is_last_tile = (tile + 1) == num_tiles;
 
 		substatus = string_printf("Path Tracing Tile %d/%d", tile, num_tiles);
 
@@ -883,6 +895,7 @@ void Session::path_trace()
 	task.need_finish_queue = params.progressive_refine;
 	task.integrator_branched = scene->integrator->method == Integrator::BRANCHED_PATH;
 	task.requested_tile_size = params.tile_size;
+	task.passes_size = tile_manager.params.get_passes_size();
 
 	device->task_add(task);
 }
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index c7ff1446171..a7e5f78a64d 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -17,15 +17,15 @@
 #ifndef __SESSION_H__
 #define __SESSION_H__
 
-#include "buffers.h"
-#include "device.h"
-#include "shader.h"
-#include "tile.h"
-
-#include "util_progress.h"
-#include "util_stats.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "render/buffers.h"
+#include "device/device.h"
+#include "render/shader.h"
+#include "render/tile.h"
+
+#include "util/util_progress.h"
+#include "util/util_stats.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 335edcbe609..12d3c6cf832 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -14,22 +14,22 @@
  * limitations under the License.
  */
 
-#include "background.h"
-#include "camera.h"
-#include "device.h"
-#include "graph.h"
-#include "integrator.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "object.h"
-#include "osl.h"
-#include "scene.h"
-#include "shader.h"
-#include "svm.h"
-#include "tables.h"
-
-#include "util_foreach.h"
+#include "render/background.h"
+#include "render/camera.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/integrator.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/object.h"
+#include "render/osl.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/svm.h"
+#include "render/tables.h"
+
+#include "util/util_foreach.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index 7d896652196..87fef19c592 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -19,20 +19,20 @@
 
 #ifdef WITH_OSL
 /* So no context pollution happens from indirectly included windows.h */
-#  include "util_windows.h"
+#  include "util/util_windows.h"
 #  include <OSL/oslexec.h>
 #endif
 
-#include "attribute.h"
-#include "kernel_types.h"
+#include "render/attribute.h"
+#include "kernel/kernel_types.h"
 
-#include "node.h"
+#include "graph/node.h"
 
-#include "util_map.h"
-#include "util_param.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_types.h"
+#include "util/util_map.h"
+#include "util/util_param.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -82,7 +82,7 @@ enum DisplacementMethod {
 
 class Shader : public Node {
 public:
-	NODE_DECLARE;
+	NODE_DECLARE
 
 	int pass_id;
 
diff --git a/intern/cycles/render/sobol.cpp b/intern/cycles/render/sobol.cpp
index e3c2e802067..ce93dc8c5d5 100644
--- a/intern/cycles/render/sobol.cpp
+++ b/intern/cycles/render/sobol.cpp
@@ -46,10 +46,10 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
-#include "sobol.h"
+#include "render/sobol.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/sobol.h b/intern/cycles/render/sobol.h
index 574f148b9a2..9fbce4e14a5 100644
--- a/intern/cycles/render/sobol.h
+++ b/intern/cycles/render/sobol.h
@@ -17,7 +17,7 @@
 #ifndef __SOBOL_H__
 #define __SOBOL_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index 955b892f4c3..4cb4018e2b4 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -14,20 +14,20 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "graph.h"
-#include "light.h"
-#include "mesh.h"
-#include "nodes.h"
-#include "scene.h"
-#include "shader.h"
-#include "svm.h"
-
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_foreach.h"
-#include "util_progress.h"
-#include "util_task.h"
+#include "device/device.h"
+#include "render/graph.h"
+#include "render/light.h"
+#include "render/mesh.h"
+#include "render/nodes.h"
+#include "render/scene.h"
+#include "render/shader.h"
+#include "render/svm.h"
+
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_foreach.h"
+#include "util/util_progress.h"
+#include "util/util_task.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -813,7 +813,6 @@ void SVMCompiler::compile(Scene *scene,
 		scoped_timer timer((summary != NULL)? &summary->time_finalize: NULL);
 		shader->graph->finalize(scene,
 		                        false,
-		                        false,
 		                        shader->has_integrator_dependency);
 	}
 
@@ -821,7 +820,6 @@ void SVMCompiler::compile(Scene *scene,
 		scoped_timer timer((summary != NULL)? &summary->time_finalize_bump: NULL);
 		shader->graph_bump->finalize(scene,
 		                             true,
-		                             false,
 		                             shader->has_integrator_dependency,
 		                             shader->displacement_method == DISPLACE_BOTH);
 	}
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index a501b6bc8b1..abbd9e50610 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -17,13 +17,13 @@
 #ifndef __SVM_H__
 #define __SVM_H__
 
-#include "attribute.h"
-#include "graph.h"
-#include "shader.h"
+#include "render/attribute.h"
+#include "render/graph.h"
+#include "render/shader.h"
 
-#include "util_set.h"
-#include "util_string.h"
-#include "util_thread.h"
+#include "util/util_set.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/tables.cpp b/intern/cycles/render/tables.cpp
index dfafd99961b..bf1ef12d602 100644
--- a/intern/cycles/render/tables.cpp
+++ b/intern/cycles/render/tables.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "device.h"
-#include "scene.h"
-#include "tables.h"
+#include "device/device.h"
+#include "render/scene.h"
+#include "render/tables.h"
 
-#include "util_debug.h"
-#include "util_logging.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/tables.h b/intern/cycles/render/tables.h
index 1bb70b22762..bc261c2a74d 100644
--- a/intern/cycles/render/tables.h
+++ b/intern/cycles/render/tables.h
@@ -17,7 +17,7 @@
 #ifndef __TABLES_H__
 #define __TABLES_H__
 
-#include <util_list.h>
+#include "util/util_list.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index a493c3fa1cd..944e746ca2d 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "tile.h"
+#include "render/tile.h"
 
-#include "util_algorithm.h"
-#include "util_types.h"
+#include "util/util_algorithm.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -131,7 +131,6 @@ void TileManager::reset(BufferParams& params_, int num_samples_)
 	state.buffer = BufferParams();
 	state.sample = range_start_sample - 1;
 	state.num_tiles = 0;
-	state.num_rendered_tiles = 0;
 	state.num_samples = 0;
 	state.resolution_divider = get_divider(params.width, params.height, start_resolution);
 	state.tiles.clear();
@@ -343,7 +342,6 @@ bool TileManager::next_tile(Tile& tile, int device)
 
 	tile = Tile(state.tiles[logical_device].front());
 	state.tiles[logical_device].pop_front();
-	state.num_rendered_tiles++;
 	return true;
 }
 
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 5d92ebac355..622b89f7670 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -19,8 +19,8 @@
 
 #include <limits.h>
 
-#include "buffers.h"
-#include "util_list.h"
+#include "render/buffers.h"
+#include "util/util_list.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -63,7 +63,6 @@ public:
 		int num_samples;
 		int resolution_divider;
 		int num_tiles;
-		int num_rendered_tiles;
 
 		/* Total samples over all pixels: Generally num_samples*num_pixels,
 		 * but can be higher due to the initial resolution division for previews. */
diff --git a/intern/cycles/subd/CMakeLists.txt b/intern/cycles/subd/CMakeLists.txt
index dafb807bdf3..fe0c221ab0d 100644
--- a/intern/cycles/subd/CMakeLists.txt
+++ b/intern/cycles/subd/CMakeLists.txt
@@ -1,11 +1,6 @@
 
 set(INC
-	.
-	../graph
-	../kernel
-	../kernel/svm
-	../render
-	../util
+	..
 )
 
 set(INC_SYS
diff --git a/intern/cycles/subd/subd_dice.cpp b/intern/cycles/subd/subd_dice.cpp
index a1bd349b167..fae815901ee 100644
--- a/intern/cycles/subd/subd_dice.cpp
+++ b/intern/cycles/subd/subd_dice.cpp
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "mesh.h"
+#include "render/camera.h"
+#include "render/mesh.h"
 
-#include "subd_dice.h"
-#include "subd_patch.h"
+#include "subd/subd_dice.h"
+#include "subd/subd_patch.h"
 
-#include "util_debug.h"
+#include "util/util_debug.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_dice.h b/intern/cycles/subd/subd_dice.h
index 33d13a4ab3a..c0e32be18c4 100644
--- a/intern/cycles/subd/subd_dice.h
+++ b/intern/cycles/subd/subd_dice.h
@@ -22,8 +22,8 @@
  * DiagSplit. For more algorithm details, see the DiagSplit paper or the
  * ARB_tessellation_shader OpenGL extension, Section 2.X.2. */
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_patch.cpp b/intern/cycles/subd/subd_patch.cpp
index d3319c5ccf5..fa2fe2bf113 100644
--- a/intern/cycles/subd/subd_patch.cpp
+++ b/intern/cycles/subd/subd_patch.cpp
@@ -16,12 +16,12 @@
 
 /* Parts adapted from code in the public domain in NVidia Mesh Tools. */
 
-#include "mesh.h"
+#include "render/mesh.h"
 
-#include "subd_patch.h"
+#include "subd/subd_patch.h"
 
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_patch.h b/intern/cycles/subd/subd_patch.h
index 360c1abf27b..1bb81588835 100644
--- a/intern/cycles/subd/subd_patch.h
+++ b/intern/cycles/subd/subd_patch.h
@@ -17,8 +17,8 @@
 #ifndef __SUBD_PATCH_H__
 #define __SUBD_PATCH_H__
 
-#include "util_boundbox.h"
-#include "util_types.h"
+#include "util/util_boundbox.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_patch_table.cpp b/intern/cycles/subd/subd_patch_table.cpp
index d437b045c07..63bf673a90b 100644
--- a/intern/cycles/subd/subd_patch_table.cpp
+++ b/intern/cycles/subd/subd_patch_table.cpp
@@ -25,10 +25,10 @@
  *
  */
 
-#include "subd_patch_table.h"
-#include "kernel_types.h"
+#include "subd/subd_patch_table.h"
+#include "kernel/kernel_types.h"
 
-#include "util_math.h"
+#include "util/util_math.h"
 
 #ifdef WITH_OPENSUBDIV
 #include <opensubdiv/far/patchTable.h>
diff --git a/intern/cycles/subd/subd_patch_table.h b/intern/cycles/subd/subd_patch_table.h
index 3166a1691d8..907f2dd6c28 100644
--- a/intern/cycles/subd/subd_patch_table.h
+++ b/intern/cycles/subd/subd_patch_table.h
@@ -17,8 +17,8 @@
 #ifndef __SUBD_PATCH_TABLE_H__
 #define __SUBD_PATCH_TABLE_H__
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 #ifdef WITH_OPENSUBDIV
 #ifdef _MSC_VER
diff --git a/intern/cycles/subd/subd_split.cpp b/intern/cycles/subd/subd_split.cpp
index 3c91ad8ab0d..9dbfc1c4e2f 100644
--- a/intern/cycles/subd/subd_split.cpp
+++ b/intern/cycles/subd/subd_split.cpp
@@ -14,16 +14,16 @@
  * limitations under the License.
  */
 
-#include "camera.h"
-#include "mesh.h"
+#include "render/camera.h"
+#include "render/mesh.h"
 
-#include "subd_dice.h"
-#include "subd_patch.h"
-#include "subd_split.h"
+#include "subd/subd_dice.h"
+#include "subd/subd_patch.h"
+#include "subd/subd_split.h"
 
-#include "util_debug.h"
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/subd/subd_split.h b/intern/cycles/subd/subd_split.h
index a2f76dd2e03..f869cc6a48e 100644
--- a/intern/cycles/subd/subd_split.h
+++ b/intern/cycles/subd/subd_split.h
@@ -22,10 +22,10 @@
  * evaluation at arbitrary points is required for this to work. See the paper
  * for more details. */
 
-#include "subd_dice.h"
+#include "subd/subd_dice.h"
 
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt
index d8abf671bd6..a015fef8284 100644
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -1,6 +1,6 @@
 
 set(INC
-	.
+	..
 	../../glew-mx
 )
 
@@ -52,6 +52,7 @@ set(SRC_HEADERS
 	util_math.h
 	util_math_cdf.h
 	util_math_fast.h
+	util_math_intersect.h
 	util_md5.h
 	util_opengl.h
 	util_optimization.h
diff --git a/intern/cycles/util/util_aligned_malloc.cpp b/intern/cycles/util/util_aligned_malloc.cpp
index 15d2eb3271b..cc7252dcc58 100644
--- a/intern/cycles/util/util_aligned_malloc.cpp
+++ b/intern/cycles/util/util_aligned_malloc.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "util_aligned_malloc.h"
-#include "util_guarded_allocator.h"
+#include "util/util_aligned_malloc.h"
+#include "util/util_guarded_allocator.h"
 
 #include <cassert>
 
diff --git a/intern/cycles/util/util_aligned_malloc.h b/intern/cycles/util/util_aligned_malloc.h
index ecc0f28c376..cf1e86ca916 100644
--- a/intern/cycles/util/util_aligned_malloc.h
+++ b/intern/cycles/util/util_aligned_malloc.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_ALIGNED_MALLOC_H__
 #define __UTIL_ALIGNED_MALLOC_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 433e41fbbb6..6c52117ef9a 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -32,6 +32,13 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
 	}
 }
 
+#define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
+
+#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+
+#define CCL_LOCAL_MEM_FENCE 0
+#define ccl_barrier(flags) (void)0
+
 #else  /* __KERNEL_GPU__ */
 
 #ifdef __KERNEL_OPENCL__
@@ -39,7 +46,7 @@ ATOMIC_INLINE void atomic_update_max_z(size_t *maximum_value, size_t value)
 /* Float atomics implementation credits:
  *   http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
  */
-ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *source,
+ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source,
                                         const float operand)
 {
 	union {
@@ -56,10 +63,29 @@ ccl_device_inline void atomic_add_and_fetch_float(volatile ccl_global float *sou
 	} while(atomic_cmpxchg((volatile ccl_global unsigned int *)source,
 	                       prev_value.int_value,
 	                       new_value.int_value) != prev_value.int_value);
+	return new_value.float_value;
 }
 
+#define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
+#define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
+
+#define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
+#define ccl_barrier(flags) barrier(flags)
+
 #endif  /* __KERNEL_OPENCL__ */
 
+#ifdef __KERNEL_CUDA__
+
+#define atomic_add_and_fetch_float(p, x) (atomicAdd((float*)(p), (float)(x)) + (float)(x))
+
+#define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int*)(p), (unsigned int)(x))
+#define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
+
+#define CCL_LOCAL_MEM_FENCE
+#define ccl_barrier(flags) __syncthreads()
+
+#endif  /* __KERNEL_CUDA__ */
+
 #endif  /* __KERNEL_GPU__ */
 
 #endif /* __UTIL_ATOMIC_H__ */
diff --git a/intern/cycles/util/util_boundbox.h b/intern/cycles/util/util_boundbox.h
index dfe4977aef3..ed94ca20211 100644
--- a/intern/cycles/util/util_boundbox.h
+++ b/intern/cycles/util/util_boundbox.h
@@ -20,10 +20,10 @@
 #include <math.h>
 #include <float.h>
 
-#include "util_math.h"
-#include "util_string.h"
-#include "util_transform.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_string.h"
+#include "util/util_transform.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_color.h b/intern/cycles/util/util_color.h
index d3598f84b94..4d673dc34d8 100644
--- a/intern/cycles/util/util_color.h
+++ b/intern/cycles/util/util_color.h
@@ -17,11 +17,11 @@
 #ifndef __UTIL_COLOR_H__
 #define __UTIL_COLOR_H__
 
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 #ifdef __KERNEL_SSE2__
-#include "util_simd.h"
+#include "util/util_simd.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 80d177d2cae..9cfa57dd741 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "util_debug.h"
+#include "util/util_debug.h"
 
 #include <stdlib.h>
 
-#include "util_logging.h"
-#include "util_string.h"
+#include "util/util_logging.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -29,7 +29,8 @@ DebugFlags::CPU::CPU()
     sse41(true),
     sse3(true),
     sse2(true),
-    qbvh(true)
+    qbvh(true),
+    split_kernel(false)
 {
 	reset();
 }
@@ -55,10 +56,12 @@ void DebugFlags::CPU::reset()
 #undef CHECK_CPU_FLAGS
 
 	qbvh = true;
+	split_kernel = false;
 }
 
 DebugFlags::CUDA::CUDA()
-  : adaptive_compile(false)
+  : adaptive_compile(false),
+    split_kernel(false)
 {
 	reset();
 }
@@ -67,12 +70,15 @@ void DebugFlags::CUDA::reset()
 {
 	if(getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
 		adaptive_compile = true;
+
+	split_kernel = false;
 }
 
 DebugFlags::OpenCL::OpenCL()
   : device_type(DebugFlags::OpenCL::DEVICE_ALL),
     kernel_type(DebugFlags::OpenCL::KERNEL_DEFAULT),
-    debug(false)
+    debug(false),
+    single_program(false)
 {
 	reset();
 }
@@ -112,6 +118,7 @@ void DebugFlags::OpenCL::reset()
 	}
 	/* Initialize other flags from environment variables. */
 	debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
+	single_program = (getenv("CYCLES_OPENCL_SINGLE_PROGRAM") != NULL);
 }
 
 DebugFlags::DebugFlags()
@@ -133,7 +140,9 @@ std::ostream& operator <<(std::ostream &os,
 	   << "  AVX    : " << string_from_bool(debug_flags.cpu.avx)   << "\n"
 	   << "  SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
 	   << "  SSE3   : " << string_from_bool(debug_flags.cpu.sse3)  << "\n"
-	   << "  SSE2   : " << string_from_bool(debug_flags.cpu.sse2)  << "\n";
+	   << "  SSE2   : " << string_from_bool(debug_flags.cpu.sse2)  << "\n"
+	   << "  QBVH   : " << string_from_bool(debug_flags.cpu.qbvh)  << "\n"
+	   << "  Split  : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
 
 	os << "CUDA flags:\n"
 	   << " Adaptive Compile: " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
@@ -172,9 +181,10 @@ std::ostream& operator <<(std::ostream &os,
 			break;
 	}
 	os << "OpenCL flags:\n"
-	   << "  Device type : " << opencl_device_type << "\n"
-	   << "  Kernel type : " << opencl_kernel_type << "\n"
-	   << "  Debug       : " << string_from_bool(debug_flags.opencl.debug)
+	   << "  Device type    : " << opencl_device_type << "\n"
+	   << "  Kernel type    : " << opencl_kernel_type << "\n"
+	   << "  Debug          : " << string_from_bool(debug_flags.opencl.debug) << "\n"
+	   << "  Single program : " << string_from_bool(debug_flags.opencl.single_program)
 	   << "\n";
 	return os;
 }
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index 73fd228b5d9..4505d584490 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -20,7 +20,7 @@
 #include <cassert>
 #include <iostream>
 
-#include "util_static_assert.h"
+#include "util/util_static_assert.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -46,6 +46,9 @@ public:
 
 		/* Whether QBVH usage is allowed or not. */
 		bool qbvh;
+
+		/* Whether split kernel is used */
+		bool split_kernel;
 	};
 
 	/* Descriptor of CUDA feature-set to be used. */
@@ -58,6 +61,9 @@ public:
 		/* Whether adaptive feature based runtime compile is enabled or not.
 		 * Requires the CUDA Toolkit and only works on Linux atm. */
 		bool adaptive_compile;
+
+		/* Whether split kernel is used */
+		bool split_kernel;
 	};
 
 	/* Descriptor of OpenCL feature-set to be used. */
@@ -106,6 +112,9 @@ public:
 
 		/* Use debug version of the kernel. */
 		bool debug;
+
+		/* Use single program */
+		bool single_program;
 	};
 
 	/* Get instance of debug flags registry. */
diff --git a/intern/cycles/util/util_guarded_allocator.cpp b/intern/cycles/util/util_guarded_allocator.cpp
index 615ac95f324..54fa6a80df5 100644
--- a/intern/cycles/util/util_guarded_allocator.cpp
+++ b/intern/cycles/util/util_guarded_allocator.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include "util_guarded_allocator.h"
-#include "util_stats.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_stats.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_guarded_allocator.h b/intern/cycles/util/util_guarded_allocator.h
index 78453d214be..5f9dcfb2481 100644
--- a/intern/cycles/util/util_guarded_allocator.h
+++ b/intern/cycles/util/util_guarded_allocator.h
@@ -20,8 +20,8 @@
 #include <cstddef>
 #include <memory>
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
 #ifdef WITH_BLENDER_GUARDEDALLOC
 #  include "../../guardedalloc/MEM_guardedalloc.h"
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index 5db3384cda4..612228dd1c1 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -17,10 +17,11 @@
 #ifndef __UTIL_HALF_H__
 #define __UTIL_HALF_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
+#include "util/util_math.h"
 
 #ifdef __KERNEL_SSE2__
-#include "util_simd.h"
+#include "util/util_simd.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
@@ -110,6 +111,28 @@ ccl_device_inline float4 half4_to_float4(half4 h)
 	return f;
 }
 
+ccl_device_inline half float_to_half(float f)
+{
+	const uint u = __float_as_uint(f);
+	/* Sign bit, shifted to its position in the 16-bit result (bit 15). */
+	uint sign_bit = u & 0x80000000;
+	sign_bit >>= 16;
+	/* Exponent field, kept at its 32-bit position for the range tests below. */
+	uint exponent_bits = u & 0x7f800000;
+	/* Non-sign bits (exponent and mantissa). */
+	uint value_bits = u & 0x7fffffff;
+	value_bits >>= 13;  /* Align mantissa on MSB. */
+	value_bits -= 0x1c000;  /* Adjust bias: (127 - 15) << 10. */
+	/* Flush-to-zero: exponent below 2^-14 (0x38800000), the smallest normal half. */
+	value_bits = (exponent_bits < 0x38800000) ? 0 : value_bits;
+	/* Clamp-to-max: exponent above 2^15 (incl. Inf/NaN) becomes 0x7bff, the largest finite half. */
+	value_bits = (exponent_bits > 0x47000000) ? 0x7bff : value_bits;
+	/* Denormals-as-zero (float exponent field is all zeros). */
+	value_bits = (exponent_bits == 0 ? 0 : value_bits);
+	/* Re-insert sign bit and return. */
+	return (value_bits | sign_bit);
+}
+
 #endif
 
 #endif
diff --git a/intern/cycles/util/util_hash.h b/intern/cycles/util/util_hash.h
index 98c3a681ff2..a30b7fe288e 100644
--- a/intern/cycles/util/util_hash.h
+++ b/intern/cycles/util/util_hash.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_HASH_H__
 #define __UTIL_HASH_H__
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_image.h b/intern/cycles/util/util_image.h
index c8efc551d97..18876841b5b 100644
--- a/intern/cycles/util/util_image.h
+++ b/intern/cycles/util/util_image.h
@@ -21,7 +21,7 @@
 
 #include <OpenImageIO/imageio.h>
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -42,4 +42,4 @@ CCL_NAMESPACE_END
 
 #endif /* __UTIL_IMAGE_H__ */
 
-#include "util_image_impl.h"
+#include "util/util_image_impl.h"
diff --git a/intern/cycles/util/util_image_impl.h b/intern/cycles/util/util_image_impl.h
index 73ecfda0855..a0f9c66f979 100644
--- a/intern/cycles/util/util_image_impl.h
+++ b/intern/cycles/util/util_image_impl.h
@@ -17,9 +17,10 @@
 #ifndef __UTIL_IMAGE_IMPL_H__
 #define __UTIL_IMAGE_IMPL_H__
 
-#include "util_algorithm.h"
-#include "util_debug.h"
-#include "util_image.h"
+#include "util/util_algorithm.h"
+#include "util/util_debug.h"
+#include "util/util_half.h"
+#include "util/util_image.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -38,6 +39,52 @@ const T *util_image_read(const vector<T>& pixels,
 	return &pixels[index];
 }
 
+/* Convert a pixel component from storage type T to float (uchar is scaled by 1/255). */
+template<typename T>
+inline float cast_to_float(T value);
+
+template<>
+inline float cast_to_float(float value)
+{
+	return value;
+}
+template<>
+inline float cast_to_float(uchar value)
+{
+	return (float)value / 255.0f;
+}
+template<>
+inline float cast_to_float(half value)
+{
+	return half_to_float(value);
+}
+
+/* Convert a float to storage type T (uchar is clamped to [0, 255] and rounded to nearest). */
+template<typename T>
+inline T cast_from_float(float value);
+
+template<>
+inline float cast_from_float(float value)
+{
+	return value;
+}
+template<>
+inline uchar cast_from_float(float value)
+{
+	if(value < 0.0f) {
+		return 0;
+	}
+	else if(value > (1.0f - 0.5f / 255.0f)) {
+		return 255;
+	}
+	return (uchar)((255.0f * value) + 0.5f);
+}
+template<>
+inline half cast_from_float(float value)
+{
+	return float_to_half(value);
+}
+
 template<typename T>
 void util_image_downscale_sample(const vector<T>& pixels,
                                  const size_t width,
@@ -71,15 +118,22 @@ void util_image_downscale_sample(const vector<T>& pixels,
 				                                 components,
 				                                 nx, ny, nz);
 				for(size_t k = 0; k < components; ++k) {
-					accum[k] += pixel[k];
+					accum[k] += cast_to_float(pixel[k]);
 				}
 				++count;
 			}
 		}
 	}
-	const float inv_count = 1.0f / (float)count;
-	for(size_t k = 0; k < components; ++k) {
-		result[k] = T(accum[k] * inv_count);
+	if(count != 0) {
+		const float inv_count = 1.0f / (float)count;
+		for(size_t k = 0; k < components; ++k) {
+			result[k] = cast_from_float<T>(accum[k] * inv_count);
+		}
+	}
+	else {
+		for(size_t k = 0; k < components; ++k) {
+			result[k] = T(0.0f);
+		}
 	}
 }
 
diff --git a/intern/cycles/util/util_logging.cpp b/intern/cycles/util/util_logging.cpp
index 03041723e15..a5a3bd34fff 100644
--- a/intern/cycles/util/util_logging.cpp
+++ b/intern/cycles/util/util_logging.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include <util_logging.h>
+#include "util/util_logging.h"
 
-#include "util_math.h"
+#include "util/util_math.h"
 
 #include <stdio.h>
 #ifdef _MSC_VER
@@ -69,6 +69,15 @@ void util_logging_verbosity_set(int verbosity)
 }
 
 std::ostream& operator <<(std::ostream &os,
+                          const int2 &value)
+{
+	os << "(" << value.x
+	   << ", " << value.y
+	   << ")";
+	return os;
+}
+
+std::ostream& operator <<(std::ostream &os,
                           const float3 &value)
 {
 	os << "(" << value.x
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index 2aa9c25b1a0..ecf9c9cfee0 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -45,6 +45,7 @@ public:
 
 #define VLOG_ONCE(level, flag) if(!flag) flag = true, VLOG(level)
 
+struct int2;
 struct float3;
 
 void util_logging_init(const char *argv0);
@@ -52,6 +53,8 @@ void util_logging_start(void);
 void util_logging_verbosity_set(int verbosity);
 
 std::ostream& operator <<(std::ostream &os,
+                          const int2 &value);
+std::ostream& operator <<(std::ostream &os,
                           const float3 &value);
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index 2b81c8c498a..e0305b978b9 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -35,7 +35,7 @@
 
 #endif
 
-#include "util_types.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -43,41 +43,41 @@ CCL_NAMESPACE_BEGIN
 
 /* Division */
 #ifndef M_PI_F
-#define M_PI_F		((float)3.14159265358979323846264338327950288) 		/* pi */
+#define M_PI_F    (3.1415926535897932f)  /* pi */
 #endif
 #ifndef M_PI_2_F
-#define M_PI_2_F	((float)1.57079632679489661923132169163975144) 		/* pi/2 */
+#define M_PI_2_F  (1.5707963267948966f)  /* pi/2 */
 #endif
 #ifndef M_PI_4_F
-#define M_PI_4_F	((float)0.785398163397448309615660845819875721) 	/* pi/4 */
+#define M_PI_4_F  (0.7853981633974483f)  /* pi/4 */
 #endif
 #ifndef M_1_PI_F
-#define M_1_PI_F	((float)0.318309886183790671537767526745028724) 	/* 1/pi */
+#define M_1_PI_F  (0.3183098861837907f)  /* 1/pi */
 #endif
 #ifndef M_2_PI_F
-#define M_2_PI_F	((float)0.636619772367581343075535053490057448) 	/* 2/pi */
+#define M_2_PI_F  (0.6366197723675813f)  /* 2/pi */
 #endif
 
 /* Multiplication */
 #ifndef M_2PI_F
-#define M_2PI_F		((float)6.283185307179586476925286766559005768)		/* 2*pi */
+#define M_2PI_F   (6.2831853071795864f)  /* 2*pi */
 #endif
 #ifndef M_4PI_F
-#define M_4PI_F		((float)12.56637061435917295385057353311801153)		/* 4*pi */
+#define M_4PI_F   (12.566370614359172f)  /* 4*pi */
 #endif
 
 /* Float sqrt variations */
 
 #ifndef M_SQRT2_F
-#define M_SQRT2_F	((float)1.41421356237309504880) 					/* sqrt(2) */
+#define M_SQRT2_F (1.4142135623730950f)  /* sqrt(2) */
 #endif
 
 #ifndef M_LN2_F
-#define M_LN2_F      ((float)0.6931471805599453)        /* ln(2) */
+#define M_LN2_F   (0.6931471805599453f)  /* ln(2) */
 #endif
 
 #ifndef M_LN10_F
-#define M_LN10_F     ((float)2.3025850929940457)        /* ln(10) */
+#define M_LN10_F  (2.3025850929940457f)  /* ln(10) */
 #endif
 
 /* Scalar */
@@ -774,6 +774,7 @@ template<size_t index_0, size_t index_1, size_t index_2, size_t index_3> __force
 	return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
 }
 
+#if defined(__KERNEL_SSE3__)
 template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4& b)
 {
 	return _mm_moveldup_ps(b);
@@ -783,6 +784,7 @@ template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4& b)
 {
 	return _mm_movehdup_ps(b);
 }
+#endif
 
 template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4& b)
 {
@@ -1241,19 +1243,6 @@ ccl_device_inline float __uint_as_float(uint i)
 	return u.f;
 }
 
-/* Versions of functions which are safe for fast math. */
-ccl_device_inline bool isnan_safe(float f)
-{
-	unsigned int x = __float_as_uint(f);
-	return (x << 1) > 0xff000000u;
-}
-
-ccl_device_inline bool isfinite_safe(float f)
-{
-	/* By IEEE 754 rule, 2*Inf equals Inf */
-	unsigned int x = __float_as_uint(f);
-	return (f == f) && (x == 0 || (f != 2.0f*f));
-}
 
 /* Interpolation */
 
@@ -1271,6 +1260,20 @@ ccl_device_inline float triangle_area(const float3& v1, const float3& v2, const
 
 #endif
 
+/* Versions of functions which are safe for fast math. */
+ccl_device_inline bool isnan_safe(float f)
+{
+	unsigned int x = __float_as_uint(f);
+	return (x << 1) > 0xff000000u;
+}
+
+ccl_device_inline bool isfinite_safe(float f)
+{
+	/* By IEEE 754 rule, 2*Inf equals Inf; +0 and -0 also equal their double, so match them by bits. */
+	unsigned int x = __float_as_uint(f);
+	return (f == f) && (x == 0 || x == (1u << 31) || (f != 2.0f*f)) && !((x << 1) > 0xff000000u);
+}
+
 /* Orthonormal vectors */
 
 ccl_device_inline void make_orthonormals(const float3 N, float3 *a, float3 *b)
@@ -1329,7 +1332,7 @@ ccl_device_inline float3 safe_divide_even_color(float3 a, float3 b)
 	y = (b.y != 0.0f)? a.y/b.y: 0.0f;
 	z = (b.z != 0.0f)? a.z/b.z: 0.0f;
 
-	/* try to get grey even if b is zero */
+	/* try to get gray even if b is zero */
 	if(b.x == 0.0f) {
 		if(b.y == 0.0f) {
 			x = z;
@@ -1451,181 +1454,9 @@ ccl_device_inline float beta(float x, float y)
 #endif
 }
 
-/* Ray Intersection */
-
-ccl_device bool ray_sphere_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 sphere_P, float sphere_radius,
-	float3 *isect_P, float *isect_t)
-{
-	float3 d = sphere_P - ray_P;
-	float radiussq = sphere_radius*sphere_radius;
-	float tsq = dot(d, d);
-
-	if(tsq > radiussq) { /* ray origin outside sphere */
-		float tp = dot(d, ray_D);
-
-		if(tp < 0.0f) /* dir points away from sphere */
-			return false;
-
-		float dsq = tsq - tp*tp; /* pythagoras */
-
-		if(dsq > radiussq) /* closest point on ray outside sphere */
-			return false;
-
-		float t = tp - sqrtf(radiussq - dsq); /* pythagoras */
-
-		if(t < ray_t) {
-			*isect_t = t;
-			*isect_P = ray_P + ray_D*t;
-			return true;
-		}
-	}
-
-	return false;
-}
-
-ccl_device bool ray_aligned_disk_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 disk_P, float disk_radius,
-	float3 *isect_P, float *isect_t)
+ccl_device_inline float xor_signmask(float x, int y)
 {
-	/* aligned disk normal */
-	float disk_t;
-	float3 disk_N = normalize_len(ray_P - disk_P, &disk_t);
-	float div = dot(ray_D, disk_N);
-
-	if(UNLIKELY(div == 0.0f))
-		return false;
-
-	/* compute t to intersection point */
-	float t = -disk_t/div;
-	if(t < 0.0f || t > ray_t)
-		return false;
-	
-	/* test if within radius */
-	float3 P = ray_P + ray_D*t;
-	if(len_squared(P - disk_P) > disk_radius*disk_radius)
-		return false;
-
-	*isect_P = P;
-	*isect_t = t;
-
-	return true;
-}
-
-ccl_device bool ray_triangle_intersect(
-	float3 ray_P, float3 ray_D, float ray_t,
-	float3 v0, float3 v1, float3 v2,
-	float3 *isect_P, float *isect_t)
-{
-	/* Calculate intersection */
-	float3 e1 = v1 - v0;
-	float3 e2 = v2 - v0;
-	float3 s1 = cross(ray_D, e2);
-
-	const float divisor = dot(s1, e1);
-	if(UNLIKELY(divisor == 0.0f))
-		return false;
-
-	const float invdivisor = 1.0f/divisor;
-
-	/* compute first barycentric coordinate */
-	const float3 d = ray_P - v0;
-	const float u = dot(d, s1)*invdivisor;
-	if(u < 0.0f)
-		return false;
-
-	/* Compute second barycentric coordinate */
-	const float3 s2 = cross(d, e1);
-	const float v = dot(ray_D, s2)*invdivisor;
-	if(v < 0.0f)
-		return false;
-
-	const float b0 = 1.0f - u - v;
-	if(b0 < 0.0f)
-		return false;
-
-	/* compute t to intersection point */
-	const float t = dot(e2, s2)*invdivisor;
-	if(t < 0.0f || t > ray_t)
-		return false;
-
-	*isect_t = t;
-	*isect_P = ray_P + ray_D*t;
-
-	return true;
-}
-
-ccl_device_inline bool ray_triangle_intersect_uv(
-        float3 ray_P, float3 ray_D, float ray_t,
-        float3 v0, float3 v1, float3 v2,
-        float *isect_u, float *isect_v, float *isect_t)
-{
-	/* Calculate intersection */
-	float3 e1 = v1 - v0;
-	float3 e2 = v2 - v0;
-	float3 s1 = cross(ray_D, e2);
-
-	const float divisor = dot(s1, e1);
-	if(UNLIKELY(divisor == 0.0f))
-		return false;
-
-	const float invdivisor = 1.0f/divisor;
-
-	/* compute first barycentric coordinate */
-	const float3 d = ray_P - v0;
-	const float u = dot(d, s1)*invdivisor;
-	if(u < 0.0f)
-		return false;
-
-	/* Compute second barycentric coordinate */
-	const float3 s2 = cross(d, e1);
-	const float v = dot(ray_D, s2)*invdivisor;
-	if(v < 0.0f)
-		return false;
-
-	const float b0 = 1.0f - u - v;
-	if(b0 < 0.0f)
-		return false;
-
-	/* compute t to intersection point */
-	const float t = dot(e2, s2)*invdivisor;
-	if(t < 0.0f || t > ray_t)
-		return false;
-
-	*isect_u = u;
-	*isect_v = v;
-	*isect_t = t;
-
-	return true;
-}
-
-ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D, float ray_mint, float ray_maxt,
-                                   float3 quad_P, float3 quad_u, float3 quad_v, float3 quad_n,
-                                   float3 *isect_P, float *isect_t, float *isect_u, float *isect_v)
-{
-	float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
-	if(t < ray_mint || t > ray_maxt)
-		return false;
-
-	float3 hit = ray_P + t*ray_D;
-	float3 inplane = hit - quad_P;
-
-	float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f;
-	if(u < 0.0f || u > 1.0f)
-		return false;
-
-	float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f;
-	if(v < 0.0f || v > 1.0f)
-		return false;
-
-	if(isect_P) *isect_P = hit;
-	if(isect_t) *isect_t = t;
-	if(isect_u) *isect_u = u;
-	if(isect_v) *isect_v = v;
-
-	return true;
+	return __int_as_float(__float_as_int(x) ^ y);
 }
 
 /* projections */
@@ -1690,4 +1521,3 @@ ccl_device_inline int util_max_axis(float3 vec)
 CCL_NAMESPACE_END
 
 #endif /* __UTIL_MATH_H__ */
-
diff --git a/intern/cycles/util/util_math_cdf.cpp b/intern/cycles/util/util_math_cdf.cpp
index ec78ca15d88..c14d4793ea1 100644
--- a/intern/cycles/util/util_math_cdf.cpp
+++ b/intern/cycles/util/util_math_cdf.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "util_math_cdf.h"
+#include "util/util_math_cdf.h"
 
-#include "util_algorithm.h"
-#include "util_math.h"
+#include "util/util_algorithm.h"
+#include "util/util_math.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_math_cdf.h b/intern/cycles/util/util_math_cdf.h
index 47dfb68ba44..79643fe26e3 100644
--- a/intern/cycles/util/util_math_cdf.h
+++ b/intern/cycles/util/util_math_cdf.h
@@ -17,9 +17,9 @@
 #ifndef __UTIL_MATH_CDF_H__
 #define __UTIL_MATH_CDF_H__
 
-#include "util_algorithm.h"
-#include "util_math.h"
-#include "util_vector.h"
+#include "util/util_algorithm.h"
+#include "util/util_math.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h
new file mode 100644
index 00000000000..2b65a0dfa48
--- /dev/null
+++ b/intern/cycles/util/util_math_intersect.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright 2011-2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_MATH_INTERSECT_H__
+#define __UTIL_MATH_INTERSECT_H__
+
+CCL_NAMESPACE_BEGIN
+
+/* Ray Intersection */
+
+ccl_device bool ray_sphere_intersect(
+        float3 ray_P, float3 ray_D, float ray_t,
+        float3 sphere_P, float sphere_radius,
+        float3 *isect_P, float *isect_t)
+{
+	const float3 d = sphere_P - ray_P;
+	const float radiussq = sphere_radius*sphere_radius;
+	const float tsq = dot(d, d);
+
+	if(tsq > radiussq) {
+		/* Ray origin outside sphere; an origin inside or on it reports no hit. */
+		const float tp = dot(d, ray_D);
+		if(tp < 0.0f) {
+			/* Ray points away from sphere. */
+			return false;
+		}
+		const float dsq = tsq - tp*tp;  /* pythagoras */
+		if(dsq > radiussq)  {
+			/* Closest point on ray outside sphere. */
+			return false;
+		}
+		const float t = tp - sqrtf(radiussq - dsq);  /* pythagoras */
+		if(t < ray_t) {
+			*isect_t = t;
+			*isect_P = ray_P + ray_D*t;
+			return true;
+		}
+	}
+	return false;
+}
+
+ccl_device bool ray_aligned_disk_intersect(
+        float3 ray_P, float3 ray_D, float ray_t,
+        float3 disk_P, float disk_radius,
+        float3 *isect_P, float *isect_t)
+{
+	/* Aligned disk normal: from disk center to ray origin (disk_t presumably gets the length). */
+	float disk_t;
+	const float3 disk_N = normalize_len(ray_P - disk_P, &disk_t);
+	const float div = dot(ray_D, disk_N);
+	if(UNLIKELY(div == 0.0f)) {
+		return false;
+	}
+	/* Compute t to intersection point; hits outside [0, ray_t] are rejected. */
+	const float t = -disk_t/div;
+	if(t < 0.0f || t > ray_t) {
+		return false;
+	}
+	/* Test if the hit point lies within the disk radius (squared distances). */
+	float3 P = ray_P + ray_D*t;
+	if(len_squared(P - disk_P) > disk_radius*disk_radius) {
+		return false;
+	}
+	*isect_P = P;
+	*isect_t = t;
+	return true;
+}
+
+#if defined(__KERNEL_CUDA__) && __CUDA_ARCH__ < 300
+ccl_device_inline
+#else
+ccl_device_forceinline
+#endif
+bool ray_triangle_intersect(
+        float3 ray_P, float3 ray_dir, float ray_t,
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+        const ssef *ssef_verts,
+#else
+        const float3 tri_a, const float3 tri_b, const float3 tri_c,
+#endif
+        float *isect_u, float *isect_v, float *isect_t)
+{
+#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
+	typedef ssef float3;
+	const float3 tri_a(ssef_verts[0]);
+	const float3 tri_b(ssef_verts[1]);
+	const float3 tri_c(ssef_verts[2]);
+	const float3 P(ray_P);
+	const float3 dir(ray_dir);
+#else
+#  define dot3(a, b) dot(a, b)
+	const float3 P = ray_P;
+	const float3 dir = ray_dir;
+#endif
+
+	/* Calculate vertices relative to ray origin. */
+	const float3 v0 = tri_c - P;
+	const float3 v1 = tri_a - P;
+	const float3 v2 = tri_b - P;
+
+	/* Calculate triangle edges. */
+	const float3 e0 = v2 - v0;
+	const float3 e1 = v0 - v1;
+	const float3 e2 = v1 - v2;
+
+	/* Perform edge tests: the ray misses when U, V, W have mixed signs. */
+#ifdef __KERNEL_SSE2__
+	const float3 crossU = cross(v2 + v0, e0);
+	const float3 crossV = cross(v0 + v1, e1);
+	const float3 crossW = cross(v1 + v2, e2);
+#  ifndef __KERNEL_SSE__
+	const ssef crossX(crossU.x, crossV.x, crossW.x, crossW.x);
+	const ssef crossY(crossU.y, crossV.y, crossW.y, crossW.y);
+	const ssef crossZ(crossU.z, crossV.z, crossW.z, crossW.z);
+#  else
+	ssef crossX(crossU);
+	ssef crossY(crossV);
+	ssef crossZ(crossW);
+	ssef zero = _mm_setzero_ps();
+	_MM_TRANSPOSE4_PS(crossX, crossY, crossZ, zero);
+#  endif
+	const ssef dirX(ray_dir.x);
+	const ssef dirY(ray_dir.y);
+	const ssef dirZ(ray_dir.z);
+	/*const*/ ssef UVWW = crossX*dirX + crossY*dirY + crossZ*dirZ;
+	const float minUVW = reduce_min(UVWW);
+	const float maxUVW = reduce_max(UVWW);
+#else  /* __KERNEL_SSE2__ */
+	const float U = dot(cross(v2 + v0, e0), ray_dir);
+	const float V = dot(cross(v0 + v1, e1), ray_dir);
+	const float W = dot(cross(v1 + v2, e2), ray_dir);
+	const float minUVW = min(U, min(V, W));
+	const float maxUVW = max(U, max(V, W));
+#endif  /* __KERNEL_SSE2__ */
+
+	if(minUVW < 0.0f && maxUVW > 0.0f) {
+		return false;
+	}
+
+	/* Calculate geometry normal and denominator. */
+	const float3 Ng1 = cross(e1, e0);
+	/* NOTE(review): a stable_triangle_normal(e2, e1, e0) variant was left disabled here. */
+	const float3 Ng = Ng1 + Ng1;
+	const float den = dot3(Ng, dir);
+	/* Avoid division by 0. */
+	if(UNLIKELY(den == 0.0f)) {
+		return false;
+	}
+
+	/* Perform depth test: with the sign of den folded out, require 0 <= T <= ray_t * |den|. */
+	const float T = dot3(v0, Ng);
+	const int sign_den = (__float_as_int(den) & 0x80000000);
+	const float sign_T = xor_signmask(T, sign_den);
+	if((sign_T < 0.0f) ||
+	   (sign_T > ray_t * xor_signmask(den, sign_den)))
+	{
+		return false;
+	}
+
+	const float inv_den = 1.0f / den;
+#ifdef __KERNEL_SSE2__
+	UVWW *= inv_den;
+	_mm_store_ss(isect_u, UVWW);
+	_mm_store_ss(isect_v, shuffle<1,1,3,3>(UVWW));
+#else
+	*isect_u = U * inv_den;
+	*isect_v = V * inv_den;
+#endif
+	*isect_t = T * inv_den;
+	return true;
+
+#undef dot3
+}
+
+ccl_device bool ray_quad_intersect(float3 ray_P, float3 ray_D,
+                                   float ray_mint, float ray_maxt,
+                                   float3 quad_P,
+                                   float3 quad_u, float3 quad_v, float3 quad_n,
+                                   float3 *isect_P, float *isect_t,
+                                   float *isect_u, float *isect_v)
+{
+	/* Intersect with the quad's plane. NOTE(review): dot(ray_D, quad_n) == 0 is not guarded. */
+	float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
+	if(t < ray_mint || t > ray_maxt) {
+		return false;
+	}
+	const float3 hit = ray_P + t*ray_D;
+	const float3 inplane = hit - quad_P;
+	const float u = dot(inplane, quad_u) / dot(quad_u, quad_u) + 0.5f;
+	if(u < 0.0f || u > 1.0f) {
+		return false;
+	}
+	const float v = dot(inplane, quad_v) / dot(quad_v, quad_v) + 0.5f;
+	if(v < 0.0f || v > 1.0f) {
+		return false;
+	}
+	/* Store the result; each output pointer is optional and skipped when NULL. */
+	/* TODO(sergey): Check whether we can avoid some checks here. */
+	if(isect_P != NULL) *isect_P = hit;
+	if(isect_t != NULL) *isect_t = t;
+	if(isect_u != NULL) *isect_u = u;
+	if(isect_v != NULL) *isect_v = v;
+	return true;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_INTERSECT_H__ */
diff --git a/intern/cycles/util/util_md5.h b/intern/cycles/util/util_md5.h
index d0af9fdb004..e4cd66c85b0 100644
--- a/intern/cycles/util/util_md5.h
+++ b/intern/cycles/util/util_md5.h
@@ -30,8 +30,8 @@
 #ifndef __UTIL_MD5_H__
 #define __UTIL_MD5_H__
 
-#include "util_string.h"
-#include "util_types.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index adc141a7b28..6f70a474fe7 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -111,7 +111,7 @@
 
 /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
  * Since we can't avoid including <windows.h>, better only include that */
-#include "util_windows.h"
+#include "util/util_windows.h"
 
 #endif
 
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 5df262fcbbb..cd3067f7650 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "util_debug.h"
-#include "util_md5.h"
-#include "util_path.h"
-#include "util_string.h"
+#include "util/util_debug.h"
+#include "util/util_md5.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
 
 #include <OpenImageIO/filesystem.h>
 #include <OpenImageIO/strutil.h>
@@ -45,7 +45,7 @@ OIIO_NAMESPACE_USING
 #  include <shlwapi.h>
 #endif
 
-#include "util_windows.h"
+#include "util/util_windows.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -320,17 +320,18 @@ static char *path_specials(const string& sub)
 {
 	static bool env_init = false;
 	static char *env_shader_path;
-	static char *env_kernel_path;
+	static char *env_source_path;
 	if(!env_init) {
 		env_shader_path = getenv("CYCLES_SHADER_PATH");
-		env_kernel_path = getenv("CYCLES_KERNEL_PATH");
+		/* NOTE: It is KERNEL in env variable for compatibility reasons. */
+		env_source_path = getenv("CYCLES_KERNEL_PATH");
 		env_init = true;
 	}
 	if(env_shader_path != NULL && sub == "shader") {
 		return env_shader_path;
 	}
-	else if(env_shader_path != NULL && sub == "kernel") {
-		return env_kernel_path;
+	else if(env_source_path != NULL && sub == "source") {
+		return env_source_path;
 	}
 	return NULL;
 }
@@ -814,7 +815,7 @@ string path_source_replace_includes(const string& source,
 						/* Use line directives for better error messages. */
 						line = line_directive(filepath, 1)
 						     + token.replace(0, n_end + 1, "\n" + text + "\n")
-						     + line_directive(path_join(path, source_filename), i);
+						     + line_directive(path_join(path, source_filename), i + 1);
 					}
 				}
 			}
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index 70dbb5ae403..0e5e2d2c837 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -24,10 +24,10 @@
 
 #include <stdio.h>
 
-#include "util_set.h"
-#include "util_string.h"
-#include "util_types.h"
-#include "util_vector.h"
+#include "util/util_set.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 14215056840..39c1eed04e7 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -23,10 +23,10 @@
  * update notifications from a job running in another thread. All methods
  * except for the constructor/destructor are thread safe. */
 
-#include "util_function.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_thread.h"
+#include "util/util_function.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_thread.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_simd.cpp b/intern/cycles/util/util_simd.cpp
index de2df612578..f90439c188b 100644
--- a/intern/cycles/util/util_simd.cpp
+++ b/intern/cycles/util/util_simd.cpp
@@ -19,7 +19,7 @@
     (defined(WITH_KERNEL_NATIVE) && defined(__SSE2__))
 
 #define __KERNEL_SSE2__
-#include "util_simd.h"
+#include "util/util_simd.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 756bd15ed25..557809a5719 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -20,8 +20,8 @@
 
 #include <limits>
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
@@ -451,11 +451,11 @@ ccl_device_inline int bitscan(int value)
 
 CCL_NAMESPACE_END
 
-#include "util_math.h"
-#include "util_sseb.h"
-#include "util_ssei.h"
-#include "util_ssef.h"
-#include "util_avxf.h"
+#include "util/util_math.h"
+#include "util/util_sseb.h"
+#include "util/util_ssei.h"
+#include "util/util_ssef.h"
+#include "util/util_avxf.h"
 
 #endif /* __UTIL_SIMD_TYPES_H__ */
 
diff --git a/intern/cycles/util/util_sky_model.cpp b/intern/cycles/util/util_sky_model.cpp
index 5730986cc4f..6dda8469907 100644
--- a/intern/cycles/util/util_sky_model.cpp
+++ b/intern/cycles/util/util_sky_model.cpp
@@ -97,8 +97,8 @@ All instructions on how to use this code are in the accompanying header file.
 
 */
 
-#include "util_sky_model.h"
-#include "util_sky_model_data.h"
+#include "util/util_sky_model.h"
+#include "util/util_sky_model_data.h"
 
 #include <assert.h>
 #include <stdio.h>
diff --git a/intern/cycles/util/util_ssef.h b/intern/cycles/util/util_ssef.h
index 2f5295b5463..cf99a08efae 100644
--- a/intern/cycles/util/util_ssef.h
+++ b/intern/cycles/util/util_ssef.h
@@ -514,12 +514,12 @@ ccl_device_inline float len3(const ssef& a)
 /* faster version for SSSE3 */
 typedef ssei shuffle_swap_t;
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
 {
 	return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
 }
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
 {
 	return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
 }
@@ -534,12 +534,12 @@ ccl_device_inline const ssef shuffle_swap(const ssef& a, const shuffle_swap_t& s
 /* somewhat slower version for SSE2 */
 typedef int shuffle_swap_t;
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_identity(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_identity(void)
 {
 	return 0;
 }
 
-ccl_device_inline const shuffle_swap_t shuffle_swap_swap(void)
+ccl_device_inline shuffle_swap_t shuffle_swap_swap(void)
 {
 	return 1;
 }
diff --git a/intern/cycles/util/util_stack_allocator.h b/intern/cycles/util/util_stack_allocator.h
index d7aab5b250c..2f1799a739e 100644
--- a/intern/cycles/util/util_stack_allocator.h
+++ b/intern/cycles/util/util_stack_allocator.h
@@ -20,8 +20,8 @@
 #include <cstddef>
 #include <memory>
 
-#include "util_debug.h"
-#include "util_types.h"
+#include "util/util_debug.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index 033d85e8ec6..e90049254de 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -43,7 +43,9 @@ template <> class StaticAssertFailure<true> {};
 #    endif  /* __COUNTER__ */
 #  endif  /* C++11 or MSVC2015 */
 #else  /* __KERNEL_GPU__ */
-#  define static_assert(statement, message)
+#  ifndef static_assert
+#    define static_assert(statement, message)
+#  endif
 #endif  /* __KERNEL_GPU__ */
 
 /* TODO(sergey): For until C++11 is a bare minimum for us,
diff --git a/intern/cycles/util/util_stats.h b/intern/cycles/util/util_stats.h
index c21a8488c81..baba549753d 100644
--- a/intern/cycles/util/util_stats.h
+++ b/intern/cycles/util/util_stats.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_STATS_H__
 #define __UTIL_STATS_H__
 
-#include "util_atomic.h"
+#include "util/util_atomic.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index 5594aa8edb6..a1008d510d1 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -17,9 +17,9 @@
 #include <stdarg.h>
 #include <stdio.h>
 
-#include "util_foreach.h"
-#include "util_string.h"
-#include "util_windows.h"
+#include "util/util_foreach.h"
+#include "util/util_string.h"
+#include "util/util_windows.h"
 
 #ifdef _WIN32
 #  ifndef vsnprintf
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index 7aeed96f00b..e2c105db9c1 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <sstream>
 
-#include "util_vector.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index 87d885c44cf..a942d738b8a 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "util_system.h"
+#include "util/util_system.h"
 
-#include "util_debug.h"
-#include "util_logging.h"
-#include "util_types.h"
-#include "util_string.h"
+#include "util/util_debug.h"
+#include "util/util_logging.h"
+#include "util/util_types.h"
+#include "util/util_string.h"
 
 #ifdef _WIN32
 #  if(!defined(FREE_WINDOWS))
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index ff61b260bed..db7a45b2d59 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -17,7 +17,7 @@
 #ifndef __UTIL_SYSTEM_H__
 #define __UTIL_SYSTEM_H__
 
-#include "util_string.h"
+#include "util/util_string.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index 0d1fed3ebbf..fb0c34e1dc4 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
-#include "util_debug.h"
-#include "util_foreach.h"
-#include "util_logging.h"
-#include "util_system.h"
-#include "util_task.h"
-#include "util_time.h"
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_system.h"
+#include "util/util_task.h"
+#include "util/util_time.h"
 
 //#define THREADING_DEBUG_ENABLED
 
diff --git a/intern/cycles/util/util_task.h b/intern/cycles/util/util_task.h
index 0b82f14f66f..3ebfb007e40 100644
--- a/intern/cycles/util/util_task.h
+++ b/intern/cycles/util/util_task.h
@@ -17,10 +17,10 @@
 #ifndef __UTIL_TASK_H__
 #define __UTIL_TASK_H__
 
-#include "util_list.h"
-#include "util_string.h"
-#include "util_thread.h"
-#include "util_vector.h"
+#include "util/util_list.h"
+#include "util/util_string.h"
+#include "util/util_thread.h"
+#include "util/util_vector.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_thread.cpp b/intern/cycles/util/util_thread.cpp
index 3db8b4bd197..3dcb09804b0 100644
--- a/intern/cycles/util/util_thread.cpp
+++ b/intern/cycles/util/util_thread.cpp
@@ -14,10 +14,10 @@
  * limitations under the License.
  */
 
-#include "util_thread.h"
+#include "util/util_thread.h"
 
-#include "util_system.h"
-#include "util_windows.h"
+#include "util/util_system.h"
+#include "util/util_windows.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_thread.h b/intern/cycles/util/util_thread.h
index 427c633d2ce..1b4e87ebf03 100644
--- a/intern/cycles/util/util_thread.h
+++ b/intern/cycles/util/util_thread.h
@@ -32,7 +32,7 @@
 #  include <libkern/OSAtomic.h>
 #endif
 
-#include "util_function.h"
+#include "util/util_function.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_time.cpp b/intern/cycles/util/util_time.cpp
index 59c963cfafb..7c39aa294bf 100644
--- a/intern/cycles/util/util_time.cpp
+++ b/intern/cycles/util/util_time.cpp
@@ -16,8 +16,8 @@
 
 #include <stdlib.h>
 
-#include "util_time.h"
-#include "util_windows.h"
+#include "util/util_time.h"
+#include "util/util_windows.h"
 
 #ifdef _WIN32
 
diff --git a/intern/cycles/util/util_transform.cpp b/intern/cycles/util/util_transform.cpp
index 2f10540c94e..b8f182ae962 100644
--- a/intern/cycles/util/util_transform.cpp
+++ b/intern/cycles/util/util_transform.cpp
@@ -46,10 +46,10 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "util_transform.h"
+#include "util/util_transform.h"
 
-#include "util_boundbox.h"
-#include "util_math.h"
+#include "util/util_boundbox.h"
+#include "util/util_math.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index a0695f20488..aef168ca64d 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -21,8 +21,8 @@
 #include <string.h>
 #endif
 
-#include "util_math.h"
-#include "util_types.h"
+#include "util/util_math.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index a000fae4bd6..bf4a134b998 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -37,6 +37,9 @@
 #define ccl_device_noinline static
 #define ccl_global
 #define ccl_constant
+#define ccl_local
+#define ccl_local_param
+#define ccl_private
 #define ccl_restrict __restrict
 #define __KERNEL_WITH_SSE_ALIGN__
 
@@ -82,7 +85,7 @@
 
 /* SIMD Types */
 
-#include "util_optimization.h"
+#include "util/util_optimization.h"
 
 #endif
 
@@ -103,10 +106,16 @@ typedef unsigned int uint;
 
 #endif
 
-#ifndef __KERNEL_GPU__
-
 /* Fixed Bits Types */
 
+#ifdef __KERNEL_OPENCL__
+
+typedef ulong uint64_t;
+
+#endif
+
+#ifndef __KERNEL_GPU__
+
 #ifdef _WIN32
 
 typedef signed char int8_t;
@@ -171,7 +180,7 @@ struct ccl_try_align(16) int3 {
 	};
 
 	__forceinline int3() {}
-	__forceinline int3(const __m128i a) : m128(a) {}
+	__forceinline int3(const __m128i& a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
 
@@ -193,7 +202,7 @@ struct ccl_try_align(16) int4 {
 	};
 
 	__forceinline int4() {}
-	__forceinline int4(const __m128i a) : m128(a) {}
+	__forceinline int4(const __m128i& a) : m128(a) {}
 	__forceinline operator const __m128i&(void) const { return m128; }
 	__forceinline operator __m128i&(void) { return m128; }
 
@@ -265,7 +274,7 @@ struct ccl_try_align(16) float4 {
 	};
 
 	__forceinline float4() {}
-	__forceinline float4(const __m128 a) : m128(a) {}
+	__forceinline float4(const __m128& a) : m128(a) {}
 	__forceinline operator const __m128&(void) const { return m128; }
 	__forceinline operator __m128&(void) { return m128; }
 
@@ -397,11 +406,6 @@ ccl_device_inline float4 make_float4(float x, float y, float z, float w)
 	return a;
 }
 
-ccl_device_inline int align_up(int offset, int alignment)
-{
-	return (offset + alignment - 1) & ~(alignment - 1);
-}
-
 ccl_device_inline int3 make_int3(int i)
 {
 #ifdef __KERNEL_SSE__
@@ -476,6 +480,21 @@ ccl_device_inline int4 make_int4(const float3& f)
 
 #endif
 
+ccl_device_inline size_t align_up(size_t offset, size_t alignment)
+{
+	return (offset + alignment - 1) & ~(alignment - 1);
+}
+
+ccl_device_inline size_t round_up(size_t x, size_t multiple)
+{
+	return ((x + multiple - 1) / multiple) * multiple;
+}
+
+ccl_device_inline size_t round_down(size_t x, size_t multiple)
+{
+	return (x / multiple) * multiple;
+}
+
 /* Interpolation types for textures
  * cuda also use texture space to store other objects */
 enum InterpolationType {
diff --git a/intern/cycles/util/util_vector.h b/intern/cycles/util/util_vector.h
index 546b17570bb..4add91a3368 100644
--- a/intern/cycles/util/util_vector.h
+++ b/intern/cycles/util/util_vector.h
@@ -23,9 +23,9 @@
 #include <cstring>
 #include <vector>
 
-#include "util_aligned_malloc.h"
-#include "util_guarded_allocator.h"
-#include "util_types.h"
+#include "util/util_aligned_malloc.h"
+#include "util/util_guarded_allocator.h"
+#include "util/util_types.h"
 
 CCL_NAMESPACE_BEGIN
 
diff --git a/intern/cycles/util/util_view.cpp b/intern/cycles/util/util_view.cpp
index 9796a5f896d..10d86167921 100644
--- a/intern/cycles/util/util_view.cpp
+++ b/intern/cycles/util/util_view.cpp
@@ -17,11 +17,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "util_opengl.h"
-#include "util_string.h"
-#include "util_time.h"
-#include "util_version.h"
-#include "util_view.h"
+#include "util/util_opengl.h"
+#include "util/util_string.h"
+#include "util/util_time.h"
+#include "util/util_version.h"
+#include "util/util_view.h"
 
 #ifdef __APPLE__
 #include <GLUT/glut.h>
diff --git a/intern/cycles/util/util_windows.cpp b/intern/cycles/util/util_windows.cpp
index 4de8483564b..073db2a27db 100644
--- a/intern/cycles/util/util_windows.cpp
+++ b/intern/cycles/util/util_windows.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "util_windows.h"
+#include "util/util_windows.h"
 
 #ifdef _WIN32
 
diff --git a/intern/decklink/CMakeLists.txt b/intern/decklink/CMakeLists.txt
index fbef65cdba4..b7727cbc32a 100644
--- a/intern/decklink/CMakeLists.txt
+++ b/intern/decklink/CMakeLists.txt
@@ -34,14 +34,14 @@ set(SRC
 	DeckLinkAPI.h
 )
 
-if (WIN32)
+if(WIN32)
 	list(APPEND SRC
 		win/DeckLinkAPI_h.h
 		win/DeckLinkAPI_i.c
 	)
 endif()
 
-if (UNIX AND NOT APPLE)
+if(UNIX AND NOT APPLE)
 	list(APPEND SRC
 		linux/DeckLinkAPI.h
 		linux/DeckLinkAPIConfiguration.h
diff --git a/intern/ffmpeg/ffmpeg_compat.h b/intern/ffmpeg/ffmpeg_compat.h
index bcfa24b06a8..d6220ebf562 100644
--- a/intern/ffmpeg/ffmpeg_compat.h
+++ b/intern/ffmpeg/ffmpeg_compat.h
@@ -350,7 +350,12 @@ int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
 FFMPEG_INLINE
 int64_t av_get_pts_from_frame(AVFormatContext *avctx, AVFrame * picture)
 {
-	int64_t pts = picture->pkt_pts;
+	int64_t pts;
+#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(55, 34, 100)
+	pts = picture->pts;
+#else
+	pts = picture->pkt_pts;
+#endif
 
 	if (pts == AV_NOPTS_VALUE) {
 		pts = picture->pkt_dts;
diff --git a/intern/ghost/GHOST_C-api.h b/intern/ghost/GHOST_C-api.h
index ff1922af4f3..6887063eae9 100644
--- a/intern/ghost/GHOST_C-api.h
+++ b/intern/ghost/GHOST_C-api.h
@@ -908,6 +908,11 @@ extern int GHOST_UseNativePixels(void);
 extern float GHOST_GetNativePixelSize(GHOST_WindowHandle windowhandle);
 
 /**
+ * Returns the suggested DPI for this window.
+ */
+extern GHOST_TUns16 GHOST_GetDPIHint(GHOST_WindowHandle windowhandle);
+
+/**
  * Enable IME attached to the given window, i.e. allows user-input
  * events to be dispatched to the IME.
  * \param windowhandle Window handle of the caller
diff --git a/intern/ghost/GHOST_IWindow.h b/intern/ghost/GHOST_IWindow.h
index 688ebecf557..4a4d6be5ced 100644
--- a/intern/ghost/GHOST_IWindow.h
+++ b/intern/ghost/GHOST_IWindow.h
@@ -332,6 +332,12 @@ public:
 
 	virtual float getNativePixelSize(void) = 0;
 
+	/**
+	 * Returns the recommended DPI for this window.
+	 * \return The recommended DPI for this window.
+	 */
+	virtual GHOST_TUns16 getDPIHint() = 0;
+
 #ifdef WITH_INPUT_IME
 	/**
 	 * Enable IME attached to the given window, i.e. allows user-input
diff --git a/intern/ghost/GHOST_Types.h b/intern/ghost/GHOST_Types.h
index 9ee4599a4a6..02b5063e515 100644
--- a/intern/ghost/GHOST_Types.h
+++ b/intern/ghost/GHOST_Types.h
@@ -189,6 +189,7 @@ typedef enum {
 	GHOST_kEventWindowUpdate,
 	GHOST_kEventWindowSize,
 	GHOST_kEventWindowMove,
+	GHOST_kEventWindowDPIHintChanged,
 	
 	GHOST_kEventDraggingEntered,
 	GHOST_kEventDraggingUpdated,
diff --git a/intern/ghost/intern/GHOST_C-api.cpp b/intern/ghost/intern/GHOST_C-api.cpp
index 41bc735e1e2..ce653188760 100644
--- a/intern/ghost/intern/GHOST_C-api.cpp
+++ b/intern/ghost/intern/GHOST_C-api.cpp
@@ -914,6 +914,12 @@ float GHOST_GetNativePixelSize(GHOST_WindowHandle windowhandle)
 	return 1.0f;
 }
 
+GHOST_TUns16 GHOST_GetDPIHint(GHOST_WindowHandle windowhandle)
+{
+	GHOST_IWindow *window = (GHOST_IWindow *)windowhandle;
+	return window->getDPIHint();
+}
+
 #ifdef WITH_INPUT_IME
 
 void GHOST_BeginIME(GHOST_WindowHandle windowhandle,
diff --git a/intern/ghost/intern/GHOST_SystemWin32.cpp b/intern/ghost/intern/GHOST_SystemWin32.cpp
index 7d55a973f91..240d7ccd2fe 100644
--- a/intern/ghost/intern/GHOST_SystemWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemWin32.cpp
@@ -111,6 +111,11 @@
 #define VK_MEDIA_PLAY_PAUSE 0xB3
 #endif // VK_MEDIA_PLAY_PAUSE
 
+// Window message newer than Windows 7
+#ifndef WM_DPICHANGED
+#define WM_DPICHANGED 0x02E0
+#endif // WM_DPICHANGED
+
 /* Workaround for some laptop touchpads, some of which seems to
  * have driver issues which makes it so window function receives
  * the message, but PeekMessage doesn't pick those messages for
@@ -152,6 +157,27 @@ static void initRawInput()
 #undef DEVICE_COUNT
 }
 
+#ifndef DPI_ENUMS_DECLARED
+typedef enum PROCESS_DPI_AWARENESS {
+	PROCESS_DPI_UNAWARE = 0,
+	PROCESS_SYSTEM_DPI_AWARE = 1,
+	PROCESS_PER_MONITOR_DPI_AWARE = 2
+} PROCESS_DPI_AWARENESS;
+
+typedef enum MONITOR_DPI_TYPE {
+	MDT_EFFECTIVE_DPI = 0,
+	MDT_ANGULAR_DPI = 1,
+	MDT_RAW_DPI = 2,
+	MDT_DEFAULT = MDT_EFFECTIVE_DPI
+} MONITOR_DPI_TYPE;
+
+#define USER_DEFAULT_SCREEN_DPI 96
+
+#define DPI_ENUMS_DECLARED
+#endif
+typedef HRESULT(API * GHOST_WIN32_SetProcessDpiAwareness)(PROCESS_DPI_AWARENESS);
+typedef BOOL(API * GHOST_WIN32_EnableNonClientDpiScaling)(HWND);
+
 GHOST_SystemWin32::GHOST_SystemWin32()
 	: m_hasPerformanceCounter(false), m_freq(0), m_start(0)
 {
@@ -161,6 +187,18 @@ GHOST_SystemWin32::GHOST_SystemWin32()
 
 	m_consoleStatus = 1;
 
+	// Tell Windows we are per monitor DPI aware. This disables the default
+	// blurry scaling and enables WM_DPICHANGED to allow us to draw at proper DPI.
+	HMODULE m_shcore = ::LoadLibrary("Shcore.dll");
+	if (m_shcore) {
+		GHOST_WIN32_SetProcessDpiAwareness fpSetProcessDpiAwareness =
+			(GHOST_WIN32_SetProcessDpiAwareness) ::GetProcAddress(m_shcore, "SetProcessDpiAwareness");
+
+		if (fpSetProcessDpiAwareness) {
+			fpSetProcessDpiAwareness(PROCESS_PER_MONITOR_DPI_AWARE);
+		}
+	}
+
 	// Check if current keyboard layout uses AltGr and save keylayout ID for
 	// specialized handling if keys like VK_OEM_*. I.e. french keylayout
 	// generates VK_OEM_8 for their exclamation key (key left of right shift)
@@ -922,6 +960,20 @@ LRESULT WINAPI GHOST_SystemWin32::s_wndProc(HWND hwnd, UINT msg, WPARAM wParam,
 	GHOST_ASSERT(system, "GHOST_SystemWin32::s_wndProc(): system not initialized");
 
 	if (hwnd) {
+		if(msg == WM_NCCREATE) {
+			// Tell Windows to automatically handle scaling of non-client areas
+			// such as the caption bar. EnableNonClientDpiScaling was introduced in Windows 10
+			HMODULE m_user32 = ::LoadLibrary("User32.dll");
+			if (m_user32) {
+				GHOST_WIN32_EnableNonClientDpiScaling fpEnableNonClientDpiScaling =
+					(GHOST_WIN32_EnableNonClientDpiScaling) ::GetProcAddress(m_user32, "EnableNonClientDpiScaling");
+
+				if (fpEnableNonClientDpiScaling) {
+					fpEnableNonClientDpiScaling(hwnd);
+				}
+			}
+		}
+
 		GHOST_WindowWin32 *window = (GHOST_WindowWin32 *)::GetWindowLongPtr(hwnd, GWLP_USERDATA);
 		if (window) {
 			switch (msg) {
@@ -1294,6 +1346,32 @@ LRESULT WINAPI GHOST_SystemWin32::s_wndProc(HWND hwnd, UINT msg, WPARAM wParam,
 					}
 
 					break;
+				case WM_DPICHANGED:
+					/* The WM_DPICHANGED message is sent when the effective dots per inch (dpi) for a window has changed.
+					* The DPI is the scale factor for a window. There are multiple events that can cause the DPI to
+					* change such as when the window is moved to a monitor with a different DPI.
+					*/
+					{
+						WORD newYAxisDPI = HIWORD(wParam);
+						WORD newXAxisDPI = LOWORD(wParam);
+						// The suggested new size and position of the window.
+						RECT* const suggestedWindowRect = (RECT*)lParam;
+
+						// Push DPI change event first
+						system->pushEvent(processWindowEvent(GHOST_kEventWindowDPIHintChanged, window));
+						system->dispatchEvents();
+						eventHandled = true;
+
+						// Then move and resize window
+						SetWindowPos(hwnd,
+							NULL,
+							suggestedWindowRect->left,
+							suggestedWindowRect->top,
+							suggestedWindowRect->right - suggestedWindowRect->left,
+							suggestedWindowRect->bottom - suggestedWindowRect->top,
+							SWP_NOZORDER | SWP_NOACTIVATE);
+					}
+					break;
 				////////////////////////////////////////////////////////////////////////
 				// Window events, ignored
 				////////////////////////////////////////////////////////////////////////
diff --git a/intern/ghost/intern/GHOST_Window.h b/intern/ghost/intern/GHOST_Window.h
index d778628ea37..2798bdf72f3 100644
--- a/intern/ghost/intern/GHOST_Window.h
+++ b/intern/ghost/intern/GHOST_Window.h
@@ -295,6 +295,15 @@ public:
 		return 1.0f;
 	}
 
+	/**
+	 * Returns the recommended DPI for this window.
+	 * \return The recommended DPI for this window.
+	 */
+	virtual inline GHOST_TUns16 getDPIHint()
+	{
+		return 96;
+	}
+
 #ifdef WITH_INPUT_IME
 	virtual void beginIME(GHOST_TInt32 x,
 	                      GHOST_TInt32 y,
diff --git a/intern/ghost/intern/GHOST_WindowSDL.cpp b/intern/ghost/intern/GHOST_WindowSDL.cpp
index 1335c38d977..aeb6188daef 100644
--- a/intern/ghost/intern/GHOST_WindowSDL.cpp
+++ b/intern/ghost/intern/GHOST_WindowSDL.cpp
@@ -563,3 +563,19 @@ GHOST_WindowSDL::setWindowCursorVisibility(bool visible)
 	SDL_ShowCursor(visible);
 	return GHOST_kSuccess;
 }
+
+GHOST_TUns16
+GHOST_WindowSDL::getDPIHint()
+{
+	int displayIndex = SDL_GetWindowDisplayIndex(m_sdl_win);
+	if (displayIndex < 0) {
+		return 96;
+	}
+
+	float ddpi;
+	if (SDL_GetDisplayDPI(displayIndex, &ddpi, NULL, NULL) != 0) {
+		return 96;
+	}
+
+	return (int)ddpi;
+}
diff --git a/intern/ghost/intern/GHOST_WindowSDL.h b/intern/ghost/intern/GHOST_WindowSDL.h
index 5f658e8ad01..96104ec28b4 100644
--- a/intern/ghost/intern/GHOST_WindowSDL.h
+++ b/intern/ghost/intern/GHOST_WindowSDL.h
@@ -168,6 +168,8 @@ protected:
 	GHOST_TSuccess beginFullScreen() const { return GHOST_kFailure; }
 
 	GHOST_TSuccess endFullScreen() const { return GHOST_kFailure; }
+
+	GHOST_TUns16 getDPIHint();
 };
 
 
diff --git a/intern/ghost/intern/GHOST_WindowWin32.cpp b/intern/ghost/intern/GHOST_WindowWin32.cpp
index 7d80aa43a40..fc46164c135 100644
--- a/intern/ghost/intern/GHOST_WindowWin32.cpp
+++ b/intern/ghost/intern/GHOST_WindowWin32.cpp
@@ -92,6 +92,7 @@ GHOST_WindowWin32::GHOST_WindowWin32(GHOST_SystemWin32 *system,
       m_tablet(0),
       m_maxPressure(0),
       m_normal_state(GHOST_kWindowStateNormal),
+	  m_user32(NULL),
       m_parentWindowHwnd(parentwindowhwnd),
       m_debug_context(is_debug)
 {
@@ -965,6 +966,23 @@ void GHOST_WindowWin32::bringTabletContextToFront()
 	}
 }
 
+GHOST_TUns16 GHOST_WindowWin32::getDPIHint()
+{
+	if (!m_user32) {
+		m_user32 = ::LoadLibrary("user32.dll");
+	}
+
+	if (m_user32) {
+		GHOST_WIN32_GetDpiForWindow fpGetDpiForWindow = (GHOST_WIN32_GetDpiForWindow) ::GetProcAddress(m_user32, "GetDpiForWindow");
+
+		if (fpGetDpiForWindow) {
+			return fpGetDpiForWindow(this->m_hWnd);
+		}
+	}
+
+	return USER_DEFAULT_SCREEN_DPI;
+}
+
 /** Reverse the bits in a GHOST_TUns8 */
 static GHOST_TUns8 uns8ReverseBits(GHOST_TUns8 ch)
 {
diff --git a/intern/ghost/intern/GHOST_WindowWin32.h b/intern/ghost/intern/GHOST_WindowWin32.h
index a1cf58c9ceb..75a33951ff4 100644
--- a/intern/ghost/intern/GHOST_WindowWin32.h
+++ b/intern/ghost/intern/GHOST_WindowWin32.h
@@ -58,6 +58,12 @@ typedef BOOL (API * GHOST_WIN32_WTClose)(HCTX);
 typedef BOOL (API * GHOST_WIN32_WTPacket)(HCTX, UINT, LPVOID);
 typedef BOOL (API * GHOST_WIN32_WTOverlap)(HCTX, BOOL);
 
+// typedefs for user32 functions to allow dynamic loading of Windows 10 DPI scaling functions
+typedef UINT(API * GHOST_WIN32_GetDpiForWindow)(HWND);
+#ifndef USER_DEFAULT_SCREEN_DPI
+#define USER_DEFAULT_SCREEN_DPI 96
+#endif // USER_DEFAULT_SCREEN_DPI
+
 /**
  * GHOST window on M$ Windows OSs.
  * \author	Maarten Gribnau
@@ -251,6 +257,8 @@ public:
 
 	GHOST_TSuccess endFullScreen() const {return GHOST_kFailure;}
 
+	GHOST_TUns16 getDPIHint() override;
+
 	/** if the window currently resizing */
 	bool m_inLiveResize;
 
@@ -351,6 +359,9 @@ private:
 
 	GHOST_TWindowState m_normal_state;
 
+	/** User32 DLL handle */
+	HMODULE m_user32;
+
 	/** Hwnd to parent window */
 	GHOST_TEmbedderWindowID m_parentWindowHwnd;
 
diff --git a/intern/ghost/intern/GHOST_WindowX11.cpp b/intern/ghost/intern/GHOST_WindowX11.cpp
index 47fbe1256b1..2019f58251f 100644
--- a/intern/ghost/intern/GHOST_WindowX11.cpp
+++ b/intern/ghost/intern/GHOST_WindowX11.cpp
@@ -56,6 +56,9 @@
 #  include <X11/extensions/XInput2.h>
 #endif
 
+//For DPI value
+#include <X11/Xresource.h>
+
 #if defined(__sun__) || defined(__sun) || defined(__sparc) || defined(__sparc__) || defined(_AIX)
 #  include <strings.h>
 #endif
@@ -68,6 +71,7 @@
 
 #include <algorithm>
 #include <string>
+#include <math.h>
 
 /* For obscure full screen mode stuff
  * lifted verbatim from blut. */
@@ -1672,3 +1676,61 @@ endFullScreen() const
 
 	return GHOST_kSuccess;
 }
+
+/**
+ * Returns the DPI hint for this window: the Xft.dpi X resource when it
+ * holds a positive value, otherwise a value computed from the screen's
+ * pixel and millimeter sizes, or 96 when neither is available.
+ */
+GHOST_TUns16
+GHOST_WindowX11::
+getDPIHint()
+{
+	/* Try to read DPI setting set using xrdb */
+	char* resMan = XResourceManagerString(m_display);
+	if (resMan) {
+		XrmDatabase xrdb = XrmGetStringDatabase(resMan);
+		if (xrdb) {
+			char* type = NULL;
+			XrmValue val;
+			int xftDPI = 0;
+
+			int success = XrmGetResource(xrdb, "Xft.dpi", "Xft.Dpi", &type, &val);
+			if (success && type && val.addr && strcmp(type, "String") == 0) {
+				/* val.addr points into the database, parse it before destroying. */
+				xftDPI = atoi((char*)val.addr);
+			}
+
+			/* The database was created by XrmGetStringDatabase(), free it on every path. */
+			XrmDestroyDatabase(xrdb);
+
+			if (xftDPI > 0) {
+				return xftDPI;
+			}
+		}
+	}
+
+	/* Fallback to calculating DPI using X reported DPI, set using xrandr --dpi */
+	XWindowAttributes attr;
+	if (!XGetWindowAttributes(m_display, m_window, &attr)) {
+		/* Failed to get window attributes, return X11 default DPI */
+		return 96;
+	}
+
+	Screen* screen = attr.screen;
+	int pixelWidth = WidthOfScreen(screen);
+	int pixelHeight = HeightOfScreen(screen);
+	int mmWidth = WidthMMOfScreen(screen);
+	int mmHeight = HeightMMOfScreen(screen);
+
+	double pixelDiagonal = sqrt((pixelWidth * pixelWidth) + (pixelHeight * pixelHeight));
+	double mmDiagonal = sqrt((mmWidth * mmWidth) + (mmHeight * mmHeight));
+	if (mmDiagonal <= 0.0) {
+		/* Physical size unknown, avoid dividing by zero below */
+		return 96;
+	}
+
+	float inchDiagonal = mmDiagonal * 0.039f;
+	int dpi = pixelDiagonal / inchDiagonal;
+	return dpi;
+}
diff --git a/intern/ghost/intern/GHOST_WindowX11.h b/intern/ghost/intern/GHOST_WindowX11.h
index 9380aa9d631..5c54c1e8162 100644
--- a/intern/ghost/intern/GHOST_WindowX11.h
+++ b/intern/ghost/intern/GHOST_WindowX11.h
@@ -235,6 +235,8 @@ public:
 
 	GHOST_TSuccess endFullScreen() const;
 
+	GHOST_TUns16 getDPIHint();
+
 protected:
 	/**
 	 * \param type	The type of rendering context create.
diff --git a/intern/libmv/libmv/numeric/numeric.h b/intern/libmv/libmv/numeric/numeric.h
index a42dab8c7a2..1a23d653676 100644
--- a/intern/libmv/libmv/numeric/numeric.h
+++ b/intern/libmv/libmv/numeric/numeric.h
@@ -36,7 +36,7 @@
 #if !defined(__MINGW64__)
 #  if defined(_WIN32) || defined(__APPLE__) || \
       defined(__FreeBSD__) || defined(__NetBSD__)
-static void sincos(double x, double *sinx, double *cosx) {
+inline void sincos(double x, double *sinx, double *cosx) {
   *sinx = sin(x);
   *cosx = cos(x);
 }
diff --git a/make.bat b/make.bat
index 337a5517656..dd82b1ca8a7 100644
--- a/make.bat
+++ b/make.bat
@@ -5,8 +5,8 @@ REM This is for users who like to configure & build Blender with a single comman
 setlocal ENABLEEXTENSIONS
 set BLENDER_DIR=%~dp0
 set BLENDER_DIR_NOSPACES=%BLENDER_DIR: =%
-if not "%BLENDER_DIR%"=="%BLENDER_DIR_NOSPACES%" ( 
-	echo There are spaces detected in the build path "%BLENDER_DIR%", this is currently not supported, exiting.... 
+if not "%BLENDER_DIR%"=="%BLENDER_DIR_NOSPACES%" (
+	echo There are spaces detected in the build path "%BLENDER_DIR%", this is currently not supported, exiting....
 	goto EOF
 )
 set BUILD_DIR=%BLENDER_DIR%..\build_windows
@@ -79,7 +79,7 @@ if NOT "%1" == "" (
 	set NOBUILD=1
 	)	else if "%1" == "showhash" (
 		for /f "delims=" %%i in ('git rev-parse HEAD') do echo Branch_hash=%%i
-		cd release/datafiles/locale 
+		cd release/datafiles/locale
 		for /f "delims=" %%i in ('git rev-parse HEAD') do echo Locale_hash=%%i
 		cd %~dp0
 		cd release/scripts/addons
@@ -132,13 +132,13 @@ if "%BUILD_ARCH%"=="x64" (
 
 
 if "%target%"=="Release" (
-		rem for vc12 check for both cuda 7.5 and 8 
+		rem for vc12 check for both cuda 7.5 and 8
 		if "%CUDA_PATH%"=="" (
 			echo Cuda Not found, aborting!
 			goto EOF
 		)
 		set BUILD_CMAKE_ARGS=%BUILD_CMAKE_ARGS% ^
-		-C"%BLENDER_DIR%\build_files\cmake\config\blender_release.cmake" 
+		-C"%BLENDER_DIR%\build_files\cmake\config\blender_release.cmake"
 )
 
 :DetectMSVC
@@ -157,7 +157,7 @@ if DEFINED MSVC_VC_DIR goto msvc_detect_finally
 if DEFINED MSVC_VC_DIR call "%MSVC_VC_DIR%\vcvarsall.bat"
 if DEFINED MSVC_VC_DIR goto sanity_checks
 
-rem MSVC Build environment 2017 and up. 
+rem MSVC Build environment 2017 and up.
 for /F "usebackq skip=2 tokens=1-2*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\SOFTWARE\Wow6432Node\Microsoft\VisualStudio\SXS\VS7" /v %BUILD_VS_VER%.0 2^>nul`) DO set MSVC_VS_DIR=%%C
 if DEFINED MSVC_VS_DIR goto msvc_detect_finally_2017
 REM Check 32 bits
@@ -202,7 +202,7 @@ if NOT EXIST %BLENDER_DIR%..\lib\nul (
 if "%TARGET%"=="" (
 	echo Error: Convenience target not set
 	echo This is required for building, aborting!
-	echo . 
+	echo .
 	goto HELP
 )
 
@@ -266,15 +266,15 @@ echo.
 echo At any point you can optionally modify your build configuration by editing:
 echo "%BUILD_DIR%\CMakeCache.txt", then run "make" again to build with the changes applied.
 echo.
-echo Blender successfully built, run from: "%BUILD_DIR%\bin\%BUILD_TYPE%"
+echo Blender successfully built, run from: "%BUILD_DIR%\bin\%BUILD_TYPE%\blender.exe"
 echo.
 goto EOF
 :HELP
 		echo.
 		echo Convenience targets
-		echo - release ^(identical to the offical blender.org builds^)
+		echo - release ^(identical to the official blender.org builds^)
 		echo - full ^(same as release minus the cuda kernels^)
-		echo - lite 
+		echo - lite
 		echo - headless
 		echo - cycles
 		echo - bpy
@@ -289,11 +289,10 @@ goto EOF
 		echo - with_tests ^(enable building unit tests^)
 		echo - debug ^(Build an unoptimized debuggable build^)
 		echo - packagename [newname] ^(override default cpack package name^)
-		echo - x86 ^(override host autodetect and build 32 bit code^)
-		echo - x64 ^(override host autodetect and build 64 bit code^)
+		echo - x86 ^(override host auto-detect and build 32 bit code^)
+		echo - x64 ^(override host auto-detect and build 64 bit code^)
 		echo - 2013 ^(build with visual studio 2013^)
 		echo - 2015 ^(build with visual studio 2015^) [EXPERIMENTAL]
 		echo.
 
 :EOF
-
diff --git a/release/datafiles/blender_icons.svg b/release/datafiles/blender_icons.svg
index e9c114ba1bd..d88788fa904 100644
--- a/release/datafiles/blender_icons.svg
+++ b/release/datafiles/blender_icons.svg
@@ -14,7 +14,7 @@
    height="640"
    id="svg2"
    sodipodi:version="0.32"
-   inkscape:version="0.91 r"
+   inkscape:version="0.91 r13725"
    version="1.0"
    sodipodi:docname="blender_icons.svg"
    inkscape:output_extension="org.inkscape.output.svg.inkscape"
@@ -31338,6 +31338,26 @@
          d="m 125.5,433.5 23,0 0,41 -33,0 0,-31 10,-10 z"
          style="display:inline;fill:url(#linearGradient13110);fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1;marker:none" />
     </clipPath>
+    <linearGradient
+       inkscape:collect="always"
+       xlink:href="#linearGradient1610-6"
+       id="linearGradient18199"
+       gradientUnits="userSpaceOnUse"
+       x1="189.76083"
+       y1="248.13905"
+       x2="116.05637"
+       y2="183.6826" />
+    <radialGradient
+       inkscape:collect="always"
+       xlink:href="#linearGradient22562"
+       id="radialGradient23167-6"
+       gradientUnits="userSpaceOnUse"
+       gradientTransform="matrix(0.99220964,-0.12457927,0.11585516,0.92272644,-34.13325,22.766225)"
+       cx="-0.78262758"
+       cy="294.63174"
+       fx="-0.78262758"
+       fy="294.63174"
+       r="6.6750002" />
   </defs>
   <sodipodi:namedview
      id="base"
@@ -31349,16 +31369,16 @@
      objecttolerance="10000"
      inkscape:pageopacity="0.0"
      inkscape:pageshadow="2"
-     inkscape:zoom="14.413868"
-     inkscape:cx="480.24726"
-     inkscape:cy="269.95478"
+     inkscape:zoom="19.997864"
+     inkscape:cx="462.52244"
+     inkscape:cy="435.14241"
      inkscape:document-units="px"
-     inkscape:current-layer="layer1"
+     inkscape:current-layer="g23149-4"
      showgrid="true"
-     inkscape:window-width="1680"
-     inkscape:window-height="1020"
-     inkscape:window-x="0"
-     inkscape:window-y="30"
+     inkscape:window-width="1920"
+     inkscape:window-height="1025"
+     inkscape:window-x="-8"
+     inkscape:window-y="-8"
      inkscape:snap-nodes="true"
      inkscape:snap-bbox="true"
      showguides="true"
@@ -92660,6 +92680,56 @@
            style="opacity:0.51999996;fill:url(#radialGradient21448-8-143);fill-opacity:1;fill-rule:evenodd;stroke:none" />
       </g>
     </g>
+    <g
+       transform="translate(335.99871,21.048284)"
+       style="display:inline;enable-background:new"
+       id="ICON_ROTATE-7">
+      <rect
+         y="178"
+         x="110"
+         height="16"
+         width="16"
+         id="rect37989-8"
+         style="display:inline;overflow:visible;visibility:visible;opacity:0;fill:#b3b3b3;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:2.79999995;marker:none;enable-background:accumulate" />
+      <path
+         style="display:inline;overflow:visible;visibility:visible;fill:none;stroke:#000000;stroke-width:2.4000001;stroke-linecap:square;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker:none;enable-background:new"
+         d="m 114.5,192.5 -3,0 0,-3 m 13,0 0,3 -3,0 m -0.25,-13 3.25,0 0,3 m -13,0 0,-3 3,0"
+         id="path37498-7"
+         sodipodi:nodetypes="cccccccccccc"
+         inkscape:connector-curvature="0" />
+      <path
+         sodipodi:nodetypes="cccccccccccc"
+         id="rect38140-7"
+         d="m 114.5,192.5 -3,0 0,-3 m 13,0 0,3 -3,0 m -0.25,-13 3.25,0 0,3 m -13,0 0,-3 3,0"
+         style="display:inline;overflow:visible;visibility:visible;fill:none;stroke:url(#linearGradient18199);stroke-width:1;stroke-linecap:square;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dashoffset:0;stroke-opacity:1;marker:none;enable-background:new"
+         inkscape:connector-curvature="0" />
+      <g
+         transform="matrix(0.59971056,0,0,0.59971056,116.78278,9.7425599)"
+         style="display:inline;enable-background:new"
+         id="g23145-9">
+        <g
+           id="g23149-4">
+          <path
+             id="path39832-9"
+             style="display:inline;overflow:visible;visibility:visible;fill:none;stroke:#1a1a1a;stroke-width:4.66725159;stroke-linecap:square;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker:none;enable-background:accumulate"
+             d="m -4.3682386,287.81345 1.5,0 c 0.999089,0 2.07885534,1.30514 2.50490386,2.78207 1.06592652,3.69512 2.80867074,9.82446 5.88525404,9.96406 2.6782554,0 1.6181317,-5.11535 3.1736046,-5.26275 l 0.25,0"
+             sodipodi:nodetypes="cssccc"
+             inkscape:connector-curvature="0" />
+          <path
+             sodipodi:nodetypes="ccscc"
+             d="m 9.3647983,295.22328 -0.4793018,0 c -2.2335161,0 0.1796731,4.94901 -3.4398065,5.09984 -4.44796752,0.18536 -5.37272213,-12.59185 -8.0767581,-12.56237 l -2,0"
+             style="display:inline;overflow:visible;visibility:visible;fill:none;stroke:#a8df84;stroke-width:2.93474906;stroke-linecap:square;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker:none;enable-background:accumulate"
+             id="path39834-2"
+             inkscape:connector-curvature="0" />
+        </g>
+        <path
+           id="path39836-9"
+           style="display:inline;overflow:visible;visibility:visible;opacity:0.35;fill:none;stroke:url(#radialGradient23167-6);stroke-width:3.53503864;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;marker:none;enable-background:accumulate"
+           d="M 5.6770841,300.48165 C 0.7393262,300.21066 0.54777814,287.99792 -2.9522219,287.99792"
+           sodipodi:nodetypes="cc"
+           inkscape:connector-curvature="0" />
+      </g>
+    </g>
   </g>
   <g
      inkscape:groupmode="layer"
diff --git a/release/datafiles/blender_icons16/icon16_normalize_fcurves.dat b/release/datafiles/blender_icons16/icon16_normalize_fcurves.dat
new file mode 100644
index 00000000000..fe118a23d79
--- /dev/null
+++ b/release/datafiles/blender_icons16/icon16_normalize_fcurves.dat
diff --git a/release/datafiles/blender_icons32/icon32_normalize_fcurves.dat b/release/datafiles/blender_icons32/icon32_normalize_fcurves.dat
new file mode 100644
index 00000000000..5b1f546a563
--- /dev/null
+++ b/release/datafiles/blender_icons32/icon32_normalize_fcurves.dat
diff --git a/release/scripts/freestyle/modules/freestyle/shaders.py b/release/scripts/freestyle/modules/freestyle/shaders.py
index 633def38b5b..bce6642220b 100644
--- a/release/scripts/freestyle/modules/freestyle/shaders.py
+++ b/release/scripts/freestyle/modules/freestyle/shaders.py
@@ -568,7 +568,7 @@ class pyRandomColorShader(StrokeShader):
 
 class py2DCurvatureColorShader(StrokeShader):
     """
-    Assigns a color (greyscale) to the stroke based on the curvature.
+    Assigns a color (grayscale) to the stroke based on the curvature.
     A higher curvature will yield a brighter color.
     """
     def shade(self, stroke):
@@ -584,7 +584,7 @@ class py2DCurvatureColorShader(StrokeShader):
 
 class pyTimeColorShader(StrokeShader):
     """
-    Assigns a greyscale value that increases for every vertex.
+    Assigns a grayscale value that increases for every vertex.
     The brightness will increase along the stroke.
     """
     def __init__(self, step=0.01):
diff --git a/release/scripts/freestyle/modules/parameter_editor.py b/release/scripts/freestyle/modules/parameter_editor.py
index 93305cb7c5a..b093920a4cb 100644
--- a/release/scripts/freestyle/modules/parameter_editor.py
+++ b/release/scripts/freestyle/modules/parameter_editor.py
@@ -1170,6 +1170,7 @@ class Seed:
 
 _seed = Seed()
 
+
 def get_dashed_pattern(linestyle):
     """Extracts the dashed pattern from the various UI options """
     pattern = []
@@ -1185,6 +1186,15 @@ def get_dashed_pattern(linestyle):
     return pattern
 
 
+def get_grouped_objects(group):
+    for ob in group.objects:
+        if ob.dupli_type == 'GROUP' and ob.dupli_group is not None:
+            for dupli in get_grouped_objects(ob.dupli_group):
+                yield dupli
+        else:
+            yield ob
+
+
 integration_types = {
     'MEAN': IntegrationType.MEAN,
     'MIN': IntegrationType.MIN,
@@ -1267,7 +1277,7 @@ def process(layer_name, lineset_name):
     # prepare selection criteria by group of objects
     if lineset.select_by_group:
         if lineset.group is not None:
-            names = {getQualifiedObjectName(ob): True for ob in lineset.group.objects}
+            names = {getQualifiedObjectName(ob): True for ob in get_grouped_objects(lineset.group)}
             upred = ObjectNamesUP1D(names, lineset.group_negation == 'EXCLUSIVE')
             selection_criteria.append(upred)
     # prepare selection criteria by image border
diff --git a/release/scripts/modules/addon_utils.py b/release/scripts/modules/addon_utils.py
index 0f096f5812c..51e3e65b78c 100644
--- a/release/scripts/modules/addon_utils.py
+++ b/release/scripts/modules/addon_utils.py
@@ -24,6 +24,7 @@ __all__ = (
     "check",
     "enable",
     "disable",
+    "disable_all",
     "reset_all",
     "module_bl_info",
 )
@@ -31,8 +32,9 @@ __all__ = (
 import bpy as _bpy
 _user_preferences = _bpy.context.user_preferences
 
-error_duplicates = False
 error_encoding = False
+# (name, file, path)
+error_duplicates = []
 addons_fake_modules = {}
 
 
@@ -57,12 +59,11 @@ def paths():
 
 
 def modules_refresh(module_cache=addons_fake_modules):
-    global error_duplicates
     global error_encoding
     import os
 
-    error_duplicates = False
     error_encoding = False
+    error_duplicates.clear()
 
     path_list = paths()
 
@@ -168,7 +169,7 @@ def modules_refresh(module_cache=addons_fake_modules):
                 if mod.__file__ != mod_path:
                     print("multiple addons with the same name:\n  %r\n  %r" %
                           (mod.__file__, mod_path))
-                    error_duplicates = True
+                    error_duplicates.append((mod.bl_info["name"], mod.__file__, mod_path))
 
                 elif mod.__time__ != os.path.getmtime(mod_path):
                     print("reloading addon:",
@@ -444,6 +445,13 @@ def reset_all(*, reload_scripts=False):
                 disable(mod_name)
 
 
+def disable_all():
+    import sys
+    # snapshot: unregister() code may add/remove entries in sys.modules
+    for mod_name, mod in list(sys.modules.items()):
+        if getattr(mod, "__addon_enabled__", False): disable(mod_name)
+
+
 def module_bl_info(mod, info_basis=None):
     if info_basis is None:
         info_basis = {
diff --git a/release/scripts/modules/bl_app_override/__init__.py b/release/scripts/modules/bl_app_override/__init__.py
new file mode 100644
index 00000000000..89cc8a0eb28
--- /dev/null
+++ b/release/scripts/modules/bl_app_override/__init__.py
@@ -0,0 +1,202 @@
+# ##### BEGIN GPL LICENSE BLOCK #####
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU General Public License
+#  as published by the Free Software Foundation; either version 2
+#  of the License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software Foundation,
+#  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+# ##### END GPL LICENSE BLOCK #####
+
+# <pep8-80 compliant>
+
+"""
+Module to manage overriding various parts of Blender.
+
+Intended for use with 'app_templates', though it can be used from anywhere.
+"""
+
+
+# TODO, how to check these aren't from add-ons.
+# templates might need to un-register while filtering.
+def class_filter(cls_parent, **kw):
+    whitelist = kw.pop("whitelist", None)  # names yielded regardless of kw
+    blacklist = kw.pop("blacklist", None)  # names never yielded
+    kw_items = tuple(kw.items())  # remaining (attr, allowed-values) filters
+    for cls in cls_parent.__subclasses__():
+        # same as is_registered()
+        if "bl_rna" in cls.__dict__:
+            if blacklist is not None and cls.__name__ in blacklist:
+                continue
+            if ((whitelist is not None and cls.__name__ in whitelist) or
+                    all((getattr(cls, attr) in expect) for attr, expect in kw_items)):
+                yield cls
+
+
+def ui_draw_filter_register(
+    *,
+    ui_ignore_classes=None,
+    ui_ignore_operator=None,
+    ui_ignore_property=None,
+    ui_ignore_menu=None,
+    ui_ignore_label=None
+):
+    import bpy
+
+    UILayout = bpy.types.UILayout
+
+    if ui_ignore_classes is None:
+        ui_ignore_classes = (
+            bpy.types.Panel,
+            bpy.types.Menu,
+            bpy.types.Header,
+        )
+
+    class OperatorProperties_Fake:
+        pass
+
+    class UILayout_Fake(bpy.types.UILayout):
+        __slots__ = ()
+
+        def __getattribute__(self, attr):
+            # ensure we always pass down UILayout_Fake instances
+            if attr in {"row", "split", "column", "box", "column_flow"}:
+                real_func = UILayout.__getattribute__(self, attr)
+
+                def dummy_func(*args, **kw):
+                    # print("wrapped", attr)
+                    ret = real_func(*args, **kw)
+                    return UILayout_Fake(ret)
+                return dummy_func
+
+            elif attr in {"operator", "operator_menu_enum", "operator_enum"}:
+                if ui_ignore_operator is None:
+                    return UILayout.__getattribute__(self, attr)
+
+                real_func = UILayout.__getattribute__(self, attr)
+
+                def dummy_func(*args, **kw):
+                    # print("wrapped", attr)
+                    if not ui_ignore_operator(args[0]):
+                        ret = real_func(*args, **kw)
+                    else:
+                        # UILayout.__getattribute__(self, "label")()
+                        # may need to be set
+                        ret = OperatorProperties_Fake()
+                    return ret
+                return dummy_func
+
+            elif attr in {"prop", "prop_enum"}:
+                if ui_ignore_property is None:
+                    return UILayout.__getattribute__(self, attr)
+
+                real_func = UILayout.__getattribute__(self, attr)
+
+                def dummy_func(*args, **kw):
+                    # print("wrapped", attr)
+                    if not ui_ignore_property(args[0].__class__.__name__, args[1]):
+                        ret = real_func(*args, **kw)
+                    else:
+                        ret = None
+                    return ret
+                return dummy_func
+
+            elif attr == "menu":
+                if ui_ignore_menu is None:
+                    return UILayout.__getattribute__(self, attr)
+
+                real_func = UILayout.__getattribute__(self, attr)
+
+                def dummy_func(*args, **kw):
+                    # print("wrapped", attr)
+                    if not ui_ignore_menu(args[0]):
+                        ret = real_func(*args, **kw)
+                    else:
+                        ret = None
+                    return ret
+                return dummy_func
+
+            elif attr == "label":
+                if ui_ignore_label is None:
+                    return UILayout.__getattribute__(self, attr)
+
+                real_func = UILayout.__getattribute__(self, attr)
+
+                def dummy_func(*args, **kw):
+                    # print("wrapped", attr)
+                    if not ui_ignore_label(args[0] if args else kw.get("text", "")):
+                        ret = real_func(*args, **kw)
+                    else:
+                        # ret = real_func()
+                        ret = None
+                    return ret
+                return dummy_func
+            else:
+                return UILayout.__getattribute__(self, attr)
+            # print(self, attr)
+
+        def operator(*args, **kw):
+            return super().operator(*args, **kw)
+
+    def draw_override(func_orig, self_real, context):
+        cls_real = self_real.__class__
+        if cls_real is super:
+            # simple, no wrapping
+            return func_orig(self_real, context)
+
+        class Wrapper(cls_real):
+            __slots__ = ()
+            def __getattribute__(self, attr):
+                if attr == "layout":
+                    return UILayout_Fake(self_real.layout)
+                else:
+                    cls = super()
+                    try:
+                        return cls.__getattr__(self, attr)
+                    except AttributeError:
+                        # class variable
+                        try:
+                            return getattr(cls, attr)
+                        except AttributeError:
+                            # for preset bl_idname access
+                            return getattr(UILayout(self), attr)
+
+            @property
+            def layout(self):
+                # print("wrapped")
+                return self_real.layout
+
+        return func_orig(Wrapper(self_real), context)
+
+    ui_ignore_store = []
+
+    for cls in ui_ignore_classes:
+        for subcls in list(cls.__subclasses__()):
+            if "draw" in subcls.__dict__:  # don't want to get parents draw()
+
+                def replace_draw():
+                    # function also serves to hold draw_orig in a local name-space
+                    draw_orig = subcls.draw
+
+                    def draw(self, context):
+                        return draw_override(draw_orig, self, context)
+                    subcls.draw = draw
+
+                ui_ignore_store.append((subcls, "draw", subcls.draw))
+
+                replace_draw()
+
+    return ui_ignore_store
+
+
+def ui_draw_filter_unregister(ui_ignore_store):
+    for (obj, attr, value) in ui_ignore_store:
+        setattr(obj, attr, value)
diff --git a/release/scripts/modules/bl_app_override/helpers.py b/release/scripts/modules/bl_app_override/helpers.py
new file mode 100644
index 00000000000..981039e8ddc
--- /dev/null
+++ b/release/scripts/modules/bl_app_override/helpers.py
@@ -0,0 +1,167 @@
+# ##### BEGIN GPL LICENSE BLOCK #####
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU General Public License
+#  as published by the Free Software Foundation; either version 2
+#  of the License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software Foundation,
+#  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+# ##### END GPL LICENSE BLOCK #####
+
+# <pep8-80 compliant>
+
+# -----------------------------------------------------------------------------
+# AppOverrideState
+
+
+class AppOverrideState:
+    """
+    Utility class to encapsulate overriding the application state
+    so that settings can be restored afterwards.
+    """
+    __slots__ = (
+        # setup_classes
+        "_class_store",
+        # setup_ui_ignore
+        "_ui_ignore_store",
+        # setup_addons
+        "_addon_store",
+    )
+
+    # ---------
+    # Callbacks
+    #
+    # Set as None, to make it simple to check if they're being overridden.
+
+    # setup/teardown classes
+    class_ignore = None
+
+    # setup/teardown ui_ignore
+    ui_ignore_classes = None
+    ui_ignore_operator = None
+    ui_ignore_property = None
+    ui_ignore_menu = None
+    ui_ignore_label = None
+
+    addon_paths = None
+    addons = None
+
+    # End callbacks
+
+    def __init__(self):
+        self._class_store = None
+        self._addon_store = None
+        self._ui_ignore_store = None
+
+    def _setup_classes(self):
+        import bpy
+        assert(self._class_store is None)
+        self._class_store = self.class_ignore()
+        from bpy.utils import unregister_class
+        for cls in self._class_store:
+            unregister_class(cls)
+
+    def _teardown_classes(self):
+        assert(self._class_store is not None)
+
+        from bpy.utils import register_class
+        for cls in self._class_store:
+            register_class(cls)
+        self._class_store = None
+
+    def _setup_ui_ignore(self):
+        import bl_app_override
+
+        self._ui_ignore_store = bl_app_override.ui_draw_filter_register(
+            ui_ignore_classes=(
+                None if self.ui_ignore_classes is None
+                else self.ui_ignore_classes()
+            ),
+            ui_ignore_operator=self.ui_ignore_operator,
+            ui_ignore_property=self.ui_ignore_property,
+            ui_ignore_menu=self.ui_ignore_menu,
+            ui_ignore_label=self.ui_ignore_label,
+        )
+
+    def _teardown_ui_ignore(self):
+        import bl_app_override
+        bl_app_override.ui_draw_filter_unregister(
+            self._ui_ignore_store
+        )
+        self._ui_ignore_store = None
+
+    def _setup_addons(self):
+        import sys
+
+        sys_path = []  # only the paths we insert, so teardown can undo them
+        if self.addon_paths is not None:
+            for path in self.addon_paths():
+                if path not in sys.path:
+                    sys.path.append(path)
+                    sys_path.append(path)
+
+        import addon_utils
+        addons = []
+        if self.addons is not None:
+            addons.extend(self.addons())
+            for addon in addons:
+                addon_utils.enable(addon)
+
+        self._addon_store = {
+            "sys_path": sys_path,
+            "addons": addons,
+        }
+
+    def _teardown_addons(self):
+        import sys
+
+        sys_path = self._addon_store["sys_path"]
+        for path in sys_path:
+            # should always succeed, but if not it doesn't matter
+            # (someone else was changing the sys.path), ignore!
+            try:
+                sys.path.remove(path)
+            except ValueError:
+                pass
+
+        addons = self._addon_store["addons"]
+        import addon_utils
+        for addon in addons:
+            addon_utils.disable(addon)
+
+        self._addon_store.clear()
+        self._addon_store = None
+
+    def setup(self):
+        if self.class_ignore is not None:
+            self._setup_classes()
+
+        if any((self.addon_paths,
+                self.addons,
+                )):
+            self._setup_addons()
+
+        if any((self.ui_ignore_operator,
+                self.ui_ignore_property,
+                self.ui_ignore_menu,
+                self.ui_ignore_label,
+                )):
+            self._setup_ui_ignore()
+
+    def teardown(self):
+        if self._class_store is not None:
+            self._teardown_classes()
+
+        if self._addon_store is not None:
+            self._teardown_addons()
+
+        if self._ui_ignore_store is not None:
+            self._teardown_ui_ignore()
diff --git a/release/scripts/modules/bl_app_template_utils.py b/release/scripts/modules/bl_app_template_utils.py
new file mode 100644
index 00000000000..b3a4824aa7b
--- /dev/null
+++ b/release/scripts/modules/bl_app_template_utils.py
@@ -0,0 +1,198 @@
+# ##### BEGIN GPL LICENSE BLOCK #####
+#
+#  This program is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU General Public License
+#  as published by the Free Software Foundation; either version 2
+#  of the License, or (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software Foundation,
+#  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+# ##### END GPL LICENSE BLOCK #####
+
+# <pep8-80 compliant>
+
+"""
+Similar to ``addon_utils``, except we can only have one active at a time.
+
+In most cases users of this module will simply call 'activate'.
+"""
+
+__all__ = (
+    "activate",
+    "import_from_path",
+    "import_from_id",
+    "reset",
+)
+
+import bpy as _bpy
+
+# Normally matches 'user_preferences.app_template_id',
+# but loading new preferences will get us out of sync.
+_app_template = {
+    "id": "",
+}
+
+# instead of sys.modules
+# note that we only ever have one template enabled at a time
+# so it may not seem necessary to use this.
+#
+# However, templates may want to share between each-other,
+# so any loaded modules are stored here?
+#
+# Note that the ID here is the app_template_id, not the module's __name__.
+_modules = {}
+
+
+def _enable(template_id, *, handle_error=None, ignore_not_found=False):
+    import os
+    import sys
+    from bpy_restrict_state import RestrictBlend
+
+    if handle_error is None:
+        def handle_error(ex):
+            import traceback
+            traceback.print_exc()
+
+    # Split registering up into 2 steps so we can undo
+    # if it fails part way through.
+
+    # disable the context, using the context at all is
+    # really bad while loading a template, don't do it!
+    with RestrictBlend():
+
+        # 1) try import
+        try:
+            mod = import_from_id(template_id, ignore_not_found=ignore_not_found)
+            if mod is None:
+                return None
+            mod.__template_enabled__ = False
+            _modules[template_id] = mod
+        except Exception as ex:
+            handle_error(ex)
+            return None
+
+        # 2) try run the modules register function
+        try:
+            mod.register()
+        except Exception as ex:
+            print("Exception in module register(): %r" %
+                  getattr(mod, "__file__", template_id))
+            handle_error(ex)
+            del _modules[template_id]
+            return None
+
+    # * OK loaded successfully! *
+    mod.__template_enabled__ = True
+
+    if _bpy.app.debug_python:
+        print("\tapp_template_utils.enable", mod.__name__)
+
+    return mod
+
+
+def _disable(template_id, *, handle_error=None):
+    """
+    Disables a template by name.
+
+    :arg template_id: The name of the template and module.
+    :type template_id: string
+    :arg handle_error: Called in the case of an error,
+       taking an exception argument.
+    :type handle_error: function
+    """
+    import sys
+
+    if handle_error is None:
+        def handle_error(ex):
+            import traceback
+            traceback.print_exc()
+
+    mod = _modules.get(template_id)
+
+    if mod and getattr(mod, "__template_enabled__", False) is not False:
+        mod.__template_enabled__ = False
+
+        try:
+            mod.unregister()
+        except Exception as ex:
+            print("Exception in module unregister(): %r" %
+                  getattr(mod, "__file__", template_id))
+            handle_error(ex)
+    else:
+        print("\tapp_template_utils.disable: %s not %s." %
+              (template_id, "disabled" if mod is None else "loaded"))
+
+    if _bpy.app.debug_python:
+        print("\tapp_template_utils.disable", template_id)
+
+
+def import_from_path(path, ignore_not_found=False):
+    import os
+    from importlib import import_module
+    base_module, template_id = path.rsplit(os.sep, 2)[-2:]
+    module_name = base_module + "." + template_id
+
+    try:
+        return import_module(module_name)
+    except ModuleNotFoundError as ex:
+        if ignore_not_found and ex.name == module_name:
+            return None
+        raise ex
+
+
+def import_from_id(template_id, ignore_not_found=False):
+    import os
+    path = next(iter(_bpy.utils.app_template_paths(template_id)), None)
+    if path is None:
+        if ignore_not_found:
+            return None
+        else:
+            raise Exception("%r template not found!" % template_id)
+    else:
+        if ignore_not_found:
+            if not os.path.exists(os.path.join(path, "__init__.py")):
+                return None
+        return import_from_path(path, ignore_not_found=ignore_not_found)
+
+
+def activate(template_id=None):
+    template_id_prev = _app_template["id"]
+
+    # not needed but may as well avoid activating same template
+    # ... in fact keep this, it will show errors early on!
+    """
+    if template_id_prev == template_id:
+        return
+    """
+
+    if template_id_prev:
+        _disable(template_id_prev)
+
+    # Disable all addons, afterwards caller must reset.
+    import addon_utils
+    addon_utils.disable_all()
+
+    # ignore_not_found so modules that don't contain scripts don't raise errors
+    mod = _enable(template_id, ignore_not_found=True) if template_id else None
+
+    _app_template["id"] = template_id
+
+
+def reset(*, reload_scripts=False):
+    """
+    Sets default state.
+    """
+    template_id = _bpy.context.user_preferences.app_template
+    if _bpy.app.debug_python:
+        print("bl_app_template_utils.reset('%s')" % template_id)
+
+    # TODO reload_scripts
+
+    activate(template_id)
diff --git a/release/scripts/modules/bpy/__init__.py b/release/scripts/modules/bpy/__init__.py
index 26fdbc8cc56..545b891505f 100644
--- a/release/scripts/modules/bpy/__init__.py
+++ b/release/scripts/modules/bpy/__init__.py
@@ -48,11 +48,11 @@ def main():
     import sys
 
     # Possibly temp. addons path
-    from os.path import join, dirname, normpath
-    sys.path.append(normpath(join(dirname(__file__),
-                                  "..", "..", "addons", "modules")))
-    sys.path.append(join(utils.user_resource('SCRIPTS'),
-                         "addons", "modules"))
+    from os.path import join, dirname
+    sys.path.extend([
+        join(dirname(dirname(dirname(__file__))), "addons", "modules"),
+        join(utils.user_resource('SCRIPTS'), "addons", "modules"),
+    ])
 
     # fake module to allow:
     #   from bpy.types import Panel
diff --git a/release/scripts/modules/bpy/path.py b/release/scripts/modules/bpy/path.py
index a864a86eba7..e17d710068c 100644
--- a/release/scripts/modules/bpy/path.py
+++ b/release/scripts/modules/bpy/path.py
@@ -204,7 +204,9 @@ def display_name(name):
     name = name.replace("_colon_", ":")
     name = name.replace("_plus_", "+")
 
-    name = name.replace("_", " ")
+    # strip to allow underscore prefix
+    # (when paths can't start with numbers for eg).
+    name = name.replace("_", " ").lstrip(" ")
 
     if name.islower():
         name = name.lower().title()
diff --git a/release/scripts/modules/bpy/utils/__init__.py b/release/scripts/modules/bpy/utils/__init__.py
index 31dd836e034..1d555ae7123 100644
--- a/release/scripts/modules/bpy/utils/__init__.py
+++ b/release/scripts/modules/bpy/utils/__init__.py
@@ -32,6 +32,7 @@ __all__ = (
     "preset_find",
     "preset_paths",
     "refresh_script_paths",
+    "app_template_paths",
     "register_class",
     "register_module",
     "register_manual_map",
@@ -49,18 +50,18 @@ __all__ = (
     "unregister_class",
     "unregister_module",
     "user_resource",
-    )
+)
 
 from _bpy import (
-        _utils_units as units,
-        blend_paths,
-        escape_identifier,
-        register_class,
-        resource_path,
-        script_paths as _bpy_script_paths,
-        unregister_class,
-        user_resource as _user_resource,
-        )
+    _utils_units as units,
+    blend_paths,
+    escape_identifier,
+    register_class,
+    resource_path,
+    script_paths as _bpy_script_paths,
+    unregister_class,
+    user_resource as _user_resource,
+)
 
 import bpy as _bpy
 import os as _os
@@ -142,7 +143,7 @@ def load_scripts(reload_scripts=False, refresh_scripts=False):
        as modules.
     :type refresh_scripts: bool
     """
-    use_time = _bpy.app.debug_python
+    use_time = use_class_register_check = _bpy.app.debug_python
 
     if use_time:
         import time
@@ -161,7 +162,8 @@ def load_scripts(reload_scripts=False, refresh_scripts=False):
         for module_name in [ext.module for ext in _user_preferences.addons]:
             _addon_utils.disable(module_name)
 
-        # *AFTER* unregistering all add-ons, otherwise all calls to unregister_module() will silently fail (do nothing).
+        # *AFTER* unregistering all add-ons, otherwise all calls to
+        # unregister_module() will silently fail (do nothing).
         _bpy_types.TypeMap.clear()
 
     def register_module_call(mod):
@@ -245,6 +247,12 @@ def load_scripts(reload_scripts=False, refresh_scripts=False):
                     for mod in modules_from_path(path, loaded_modules):
                         test_register(mod)
 
+    # load template (if set)
+    if any(_bpy.utils.app_template_paths()):
+        import bl_app_template_utils
+        bl_app_template_utils.reset(reload_scripts=reload_scripts)
+        del bl_app_template_utils
+
     # deal with addons separately
     _initialize = getattr(_addon_utils, "_initialize", None)
     if _initialize is not None:
@@ -269,13 +277,21 @@ def load_scripts(reload_scripts=False, refresh_scripts=False):
     if use_time:
         print("Python Script Load Time %.4f" % (time.time() - t_main))
 
+    if use_class_register_check:
+        for cls in _bpy.types.bpy_struct.__subclasses__():
+            if getattr(cls, "is_registered", False):
+                for subcls in cls.__subclasses__():
+                    if not subcls.is_registered:
+                        print(
+                            "Warning, unregistered class: %s(%s)" %
+                            (subcls.__name__, cls.__name__)
+                        )
+
 
 # base scripts
-_scripts = _os.path.join(_os.path.dirname(__file__),
-                         _os.path.pardir,
-                         _os.path.pardir,
-                         )
-_scripts = (_os.path.normpath(_scripts), )
+_scripts = (
+    _os.path.dirname(_os.path.dirname(_os.path.dirname(__file__))),
+)
 
 
 def script_path_user():
@@ -356,6 +372,38 @@ def refresh_script_paths():
             _sys_path_ensure(path)
 
 
+def app_template_paths(subdir=None):
+    """
+    Returns valid application template paths.
+
+    :arg subdir: Optional subdir.
+    :type subdir: string
+    :return: app template paths.
+    :rtype: generator
+    """
+
+    # note: LOCAL, USER, SYSTEM order matches script resolution order.
+    subdir_tuple = (subdir,) if subdir is not None else ()
+
+    path = _os.path.join(*(
+        resource_path('LOCAL'), "scripts", "startup",
+        "bl_app_templates_user", *subdir_tuple))
+    if _os.path.isdir(path):
+        yield path
+    else:
+        path = _os.path.join(*(
+            resource_path('USER'), "scripts", "startup",
+            "bl_app_templates_user", *subdir_tuple))
+        if _os.path.isdir(path):
+            yield path
+
+    path = _os.path.join(*(
+        resource_path('SYSTEM'), "scripts", "startup",
+        "bl_app_templates_system", *subdir_tuple))
+    if _os.path.isdir(path):
+        yield path
+
+
 def preset_paths(subdir):
     """
     Returns a list of paths for a specific preset.
diff --git a/release/scripts/modules/bpy_types.py b/release/scripts/modules/bpy_types.py
index d64acd2ce3b..600b29a6b2b 100644
--- a/release/scripts/modules/bpy_types.py
+++ b/release/scripts/modules/bpy_types.py
@@ -683,6 +683,10 @@ class _GenericUI:
         return draw_funcs
 
     @classmethod
+    def is_extended(cls):
+        return bool(getattr(cls.draw, "_draw_funcs", None))
+
+    @classmethod
     def append(cls, draw_func):
         """
         Append a draw function to this menu,
@@ -725,11 +729,30 @@ class Header(StructRNA, _GenericUI, metaclass=RNAMeta):
 class Menu(StructRNA, _GenericUI, metaclass=RNAMeta):
     __slots__ = ()
 
-    def path_menu(self, searchpaths, operator,
-                  props_default=None, filter_ext=None):
+    def path_menu(self, searchpaths, operator, *,
+                  props_default=None, prop_filepath="filepath",
+                  filter_ext=None, filter_path=None, display_name=None):
+        """
+        Populate a menu from a list of paths.
+
+        :arg searchpaths: Paths to scan.
+        :type searchpaths: sequence of strings.
+        :arg operator: The operator id to use with each file.
+        :type operator: string
+        :arg prop_filepath: Optional operator filepath property (defaults to "filepath").
+        :type prop_filepath: string
+        :arg props_default: Properties to assign to each operator.
+        :type props_default: dict
+        :arg filter_ext: Optional callback that takes the file extensions.
+
+           Returning false excludes the file from the list.
+
+        :type filter_ext: Callable that takes a string and returns a bool.
+        :arg display_name: Optional callback that takes the full path, returns the name to display.
+        :type display_name: Callable that takes a string and returns a string.
+        """
 
         layout = self.layout
-        # hard coded to set the operators 'filepath' to the filename.
 
         import os
         import bpy.utils
@@ -742,25 +765,32 @@ class Menu(StructRNA, _GenericUI, metaclass=RNAMeta):
         # collect paths
         files = []
         for directory in searchpaths:
-            files.extend([(f, os.path.join(directory, f))
-                          for f in os.listdir(directory)
-                          if (not f.startswith("."))
-                          if ((filter_ext is None) or
-                              (filter_ext(os.path.splitext(f)[1])))
-                          ])
+            files.extend(
+                [(f, os.path.join(directory, f))
+                 for f in os.listdir(directory)
+                 if (not f.startswith("."))
+                 if ((filter_ext is None) or
+                     (filter_ext(os.path.splitext(f)[1])))
+                 if ((filter_path is None) or
+                     (filter_path(f)))
+                 ])
 
         files.sort()
 
         for f, filepath in files:
-            props = layout.operator(operator,
-                                    text=bpy.path.display_name(f),
-                                    translate=False)
+            # Intentionally pass the full path to 'display_name' callback,
+            # since the callback may want to use part of a directory in the name.
+            props = layout.operator(
+                operator,
+                text=display_name(filepath) if display_name else bpy.path.display_name(f),
+                translate=False,
+            )
 
             if props_default is not None:
                 for attr, value in props_default.items():
                     setattr(props, attr, value)
 
-            props.filepath = filepath
+            setattr(props, prop_filepath, filepath)
             if operator == "script.execute_preset":
                 props.menu_idname = self.bl_idname
 
diff --git a/release/scripts/modules/rna_keymap_ui.py b/release/scripts/modules/rna_keymap_ui.py
index 2ca7a7997a5..a1a4e5b8763 100644
--- a/release/scripts/modules/rna_keymap_ui.py
+++ b/release/scripts/modules/rna_keymap_ui.py
@@ -128,16 +128,15 @@ def draw_kmi(display_keymaps, kc, km, kmi, layout, level):
     else:
         box = col.column()
 
-    split = box.split(percentage=0.01)
+    split = box.split()
 
     # header bar
-    row = split.row()
+    row = split.row(align=True)
     row.prop(kmi, "show_expanded", text="", emboss=False)
-
-    row = split.row()
     row.prop(kmi, "active", text="", emboss=False)
 
     if km.is_modal:
+        row.separator()
         row.prop(kmi, "propvalue", text="")
     else:
         row.label(text=kmi.name)
diff --git a/release/scripts/presets/interface_theme/back_to_black.xml b/release/scripts/presets/interface_theme/back_to_black.xml
index 915e9cb64f1..1636f5b5cf6 100644
--- a/release/scripts/presets/interface_theme/back_to_black.xml
+++ b/release/scripts/presets/interface_theme/back_to_black.xml
@@ -18,7 +18,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_regular>
         <wcol_tool>
@@ -30,19 +30,19 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_tool>
         <wcol_radio>
           <ThemeWidgetColors outline="#2a2a2a"
                              inner="#111111ff"
                              inner_sel="#33406bff"
-                             item="#191919ff"
+                             item="#444444ff"
                              text="#929292"
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_radio>
         <wcol_text>
@@ -50,23 +50,23 @@
                              inner="#111111ff"
                              inner_sel="#33406bff"
                              item="#191919ff"
-                             text="#e4e4e4"
+                             text="#929292"
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_text>
         <wcol_option>
-          <ThemeWidgetColors outline="#2a2a2a"
+          <ThemeWidgetColors outline="#535353"
                              inner="#111111ff"
                              inner_sel="#33406bff"
-                             item="#000000ff"
-                             text="#c7c7c7"
+                             item="#a3a3a3ff"
+                             text="#929292"
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_option>
         <wcol_toggle>
@@ -78,7 +78,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_toggle>
         <wcol_num>
@@ -90,7 +90,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_num>
         <wcol_numslider>
@@ -102,7 +102,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_numslider>
         <wcol_box>
@@ -114,7 +114,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_box>
         <wcol_menu>
@@ -126,7 +126,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_menu>
         <wcol_pulldown>
@@ -138,7 +138,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_pulldown>
         <wcol_menu_back>
@@ -150,7 +150,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_menu_back>
         <wcol_pie_menu>
@@ -170,7 +170,7 @@
                              inner="#191919e6"
                              inner_sel="#2d2d2de6"
                              item="#646464ff"
-                             text="#ffffff"
+                             text="#929292"
                              text_sel="#ffffff"
                              show_shaded="FALSE"
                              shadetop="25"
@@ -186,7 +186,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_menu_item>
         <wcol_scroll>
@@ -198,7 +198,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_scroll>
         <wcol_progress>
@@ -210,7 +210,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_progress>
         <wcol_list_item>
@@ -222,7 +222,7 @@
                              text_sel="#ffffff"
                              show_shaded="TRUE"
                              shadetop="-100"
-                             shadedown="0">
+                             shadedown="5">
           </ThemeWidgetColors>
         </wcol_list_item>
         <wcol_state>
@@ -239,32 +239,35 @@
     </user_interface>
     <view_3d>
       <ThemeView3D grid="#222222"
+                   clipping_border_3d="#313131ff"
                    wire="#888888"
-                   wire_edit="#000000"
+                   wire_edit="#6c75ff"
                    gp_vertex="#000000"
                    gp_vertex_select="#ff8500"
                    gp_vertex_size="3"
-                   lamp="#c1d40028"
-                   speaker="#535353"
-                   camera="#000000"
-                   view_overlay="#000000"
-                   empty="#000000"
+                   text_grease_pencil="#b5e61d"
                    object_selected="#f15800"
                    object_active="#ff8c19"
                    object_grouped="#083008"
                    object_grouped_active="#55bb55"
-                   transform="#ffffff"
+                   text_keyframe="#ddd700"
+                   camera="#535353"
+                   empty="#535353"
+                   lamp="#fff0d328"
+                   speaker="#535353"
                    vertex="#72cfdd"
                    vertex_select="#ff8500"
                    vertex_size="3"
+                   vertex_bevel="#00a5ff"
                    vertex_unreferenced="#000000"
                    edge_select="#ffa000"
                    edge_seam="#db2512"
                    edge_sharp="#ff2020"
                    edge_crease="#cc0099"
+                   edge_bevel="#00a5ff"
                    edge_facesel="#6b6b6b"
                    freestyle_edge_mark="#7fff7f"
-                   face="#73828f12"
+                   face="#73828f41"
                    face_select="#ffa4003c"
                    face_dot="#ffa900"
                    facedot_size="4"
@@ -291,19 +294,18 @@
                    normal="#22dddd"
                    vertex_normal="#2361dd"
                    split_normal="#dd23dd"
-                   bone_solid="#c8c8c8"
                    bone_pose="#50c8ff"
                    bone_pose_active="#8cffff"
-                   frame_current="#60c040"
-                   outline_width="1"
+                   bone_solid="#c8c8c8"
                    bundle_solid="#c8c8c8"
                    camera_path="#5a5a5a"
                    skin_root="#000000"
-                   clipping_border_3d="#313131ff"
-                   text_keyframe="#ddd700"
-                   text_grease_pencil="#b5e61d"
+                   view_overlay="#000000"
+                   transform="#ffffff"
+                   frame_current="#60c040"
                    paint_curve_handle="#7fff7f7f"
-                   paint_curve_pivot="#ff7f7f7f">
+                   paint_curve_pivot="#ff7f7f7f"
+                   outline_width="1">
         <space>
           <ThemeSpaceGradient title="#5d5d5d"
                               text="#7d7d7d"
@@ -312,23 +314,23 @@
                               header_text="#979797"
                               header_text_hi="#ffffff"
                               button="#00000057"
-                              button_title="#c5c5c5"
+                              button_title="#929292"
                               button_text="#c3c3c3"
-                              button_text_hi="#ffffff"
+                              button_text_hi="#e5e5e5"
                               tab_active="#212947"
                               tab_inactive="#000000"
                               tab_back="#060606ff"
                               tab_outline="#000000">
             <gradients>
               <ThemeGradientColors show_grad="TRUE"
-                                   gradient="#0a0a0a"
+                                   gradient="#1d1d1d"
                                    high_gradient="#000000">
               </ThemeGradientColors>
             </gradients>
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -348,6 +350,7 @@
                         vertex="#ffffff"
                         vertex_select="#ff8500"
                         vertex_size="3"
+                        vertex_bevel="#000000"
                         vertex_unreferenced="#000000"
                         handle_free="#808080"
                         handle_auto="#909000"
@@ -382,7 +385,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -418,7 +421,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -463,7 +466,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -501,6 +504,7 @@
                       keyframe_jitter_selected="#61c042"
                       keyframe_border="#000000ff"
                       keyframe_border_selected="#000000ff"
+                      keyframe_scale_factor="1"
                       summary="#00000000">
         <space>
           <ThemeSpaceGeneric back="#080808"
@@ -521,7 +525,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -543,6 +547,7 @@
                         vertex="#0f13bb"
                         vertex_select="#ff8500"
                         vertex_size="3"
+                        vertex_bevel="#000000"
                         vertex_unreferenced="#000000"
                         face="#ffffff0a"
                         face_select="#ff85003c"
@@ -596,7 +601,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -644,7 +649,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -673,7 +678,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -712,7 +717,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -744,7 +749,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -799,7 +804,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -835,7 +840,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -865,7 +870,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -903,7 +908,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -932,7 +937,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -966,7 +971,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
@@ -1019,7 +1024,7 @@
             <panelcolors>
               <ThemePanelColors header="#00000019"
                                 back="#72727280"
-                                show_header="FALSE"
+                                show_header="TRUE"
                                 show_back="FALSE">
               </ThemePanelColors>
             </panelcolors>
diff --git a/release/scripts/presets/keyconfig/3dsmax.py b/release/scripts/presets/keyconfig/3dsmax.py
index 7694e338d68..6d05ff6982c 100644
--- a/release/scripts/presets/keyconfig/3dsmax.py
+++ b/release/scripts/presets/keyconfig/3dsmax.py
@@ -401,6 +401,12 @@ kmi = km.keymap_items.new('particle.hide', 'H', 'PRESS')
 kmi.properties.unselected = False
 kmi = km.keymap_items.new('particle.hide', 'H', 'PRESS', shift=True)
 kmi.properties.unselected = True
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_planar_constraint= True
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_accurate= True
 kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True)
 kmi.properties.release_confirm = True
 kmi = km.keymap_items.new('particle.brush_edit', 'LEFTMOUSE', 'PRESS')
@@ -422,6 +428,12 @@ kmi.properties.value_2 = 'ENABLED'
 # Map 3D View
 km = kc.keymaps.new('3D View', space_type='VIEW_3D', region_type='WINDOW', modal=False)
 
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_planar_constraint= True
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_accurate= True
 kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True)
 kmi.properties.release_confirm = True
 kmi = km.keymap_items.new('view3d.cursor3d', 'ACTIONMOUSE', 'PRESS')
diff --git a/release/scripts/presets/keyconfig/maya.py b/release/scripts/presets/keyconfig/maya.py
index 67fd1fddcac..53129593a59 100644
--- a/release/scripts/presets/keyconfig/maya.py
+++ b/release/scripts/presets/keyconfig/maya.py
@@ -933,6 +933,12 @@ km = kc.keymaps.new('3D View', space_type='VIEW_3D', region_type='WINDOW', modal
 
 kmi = km.keymap_items.new('view3d.cursor3d', 'ACTIONMOUSE', 'PRESS')
 kmi = km.keymap_items.new('view3d.rotate', 'LEFTMOUSE', 'PRESS', alt=True)
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_planar_constraint = True
+kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', shift=True)
+kmi.properties.release_confirm = True
+kmi.properties.use_accurate = True
 kmi = km.keymap_items.new('view3d.manipulator', 'LEFTMOUSE', 'PRESS', any=True)
 kmi.properties.release_confirm = True
 kmi = km.keymap_items.new('view3d.move', 'MIDDLEMOUSE', 'PRESS', alt=True)
diff --git a/release/scripts/startup/bl_operators/__init__.py b/release/scripts/startup/bl_operators/__init__.py
index a696410ca1c..1e0dbe6925e 100644
--- a/release/scripts/startup/bl_operators/__init__.py
+++ b/release/scripts/startup/bl_operators/__init__.py
@@ -21,8 +21,7 @@
 # support reloading sub-modules
 if "bpy" in locals():
     from importlib import reload
-    for val in _modules_loaded:
-        reload(val)
+    _modules_loaded[:] = [reload(val) for val in _modules_loaded]
     del reload
 
 _modules = [
@@ -63,8 +62,15 @@ del _namespace
 
 
 def register():
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for mod in _modules_loaded:
+        for cls in mod.classes:
+            register_class(cls)
 
 
 def unregister():
-    bpy.utils.unregister_module(__name__)
+    from bpy.utils import unregister_class
+    for mod in reversed(_modules_loaded):
+        for cls in reversed(mod.classes):
+            if cls.is_registered:
+                unregister_class(cls)
diff --git a/release/scripts/startup/bl_operators/add_mesh_torus.py b/release/scripts/startup/bl_operators/add_mesh_torus.py
index 247b91e147f..0e5acea94f9 100644
--- a/release/scripts/startup/bl_operators/add_mesh_torus.py
+++ b/release/scripts/startup/bl_operators/add_mesh_torus.py
@@ -282,3 +282,8 @@ class AddTorus(Operator, object_utils.AddObjectHelper):
         object_utils.object_data_add(context, mesh, operator=self)
 
         return {'FINISHED'}
+
+
+classes = (
+    AddTorus,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/anim.py b/release/scripts/startup/bl_operators/anim.py
index c20d591241c..78fcf0dd124 100644
--- a/release/scripts/startup/bl_operators/anim.py
+++ b/release/scripts/startup/bl_operators/anim.py
@@ -28,11 +28,11 @@ if "bpy" in locals():
 import bpy
 from bpy.types import Operator
 from bpy.props import (
-        IntProperty,
-        BoolProperty,
-        EnumProperty,
-        StringProperty,
-        )
+    IntProperty,
+    BoolProperty,
+    EnumProperty,
+    StringProperty,
+)
 
 
 class ANIM_OT_keying_set_export(Operator):
@@ -115,30 +115,30 @@ class ANIM_OT_keying_set_export(Operator):
                 # Find material or lamp using this node tree...
                 id_bpy_path = "bpy.data.nodes[\"%s\"]"
                 found = False
-                
+
                 for mat in bpy.data.materials:
                     if mat.node_tree == ksp.id:
                         id_bpy_path = "bpy.data.materials[\"%s\"].node_tree" % (mat.name)
                         found = True
-                        break;
-                        
+                        break
+
                 if not found:
                     for lamp in bpy.data.lamps:
                         if lamp.node_tree == ksp.id:
                             id_bpy_path = "bpy.data.lamps[\"%s\"].node_tree" % (lamp.name)
                             found = True
-                            break;
-                    
+                            break
+
                 if not found:
-                    self.report({'WARN'}, "Could not find material or lamp using Shader Node Tree - %s" % (ksp.id))                    
+                    self.report({'WARN'}, "Could not find material or lamp using Shader Node Tree - %s" % (ksp.id))
             elif ksp.id.bl_rna.identifier.startswith("CompositorNodeTree"):
                 # Find compositor nodetree using this node tree...
                 for scene in bpy.data.scenes:
                     if scene.node_tree == ksp.id:
                         id_bpy_path = "bpy.data.scenes[\"%s\"].node_tree" % (scene.name)
-                        break;
+                        break
                 else:
-                    self.report({'WARN'}, "Could not find scene using Compositor Node Tree - %s" % (ksp.id)) 
+                    self.report({'WARN'}, "Could not find scene using Compositor Node Tree - %s" % (ksp.id))
             else:
                 idtype_list = ksp.id.bl_rna.name.lower() + "s"
                 id_bpy_path = "bpy.data.%s[\"%s\"]" % (idtype_list, ksp.id.name)
@@ -302,9 +302,11 @@ class ClearUselessActions(Operator):
     bl_label = "Clear Useless Actions"
     bl_options = {'REGISTER', 'UNDO'}
 
-    only_unused = BoolProperty(name="Only Unused",
+    only_unused = BoolProperty(
+            name="Only Unused",
             description="Only unused (Fake User only) actions get considered",
-            default=True)
+            default=True,
+            )
 
     @classmethod
     def poll(cls, context):
@@ -393,7 +395,7 @@ class UpdateAnimatedTransformConstraint(Operator):
                     except:
                         pass
                     ret = (data, new_path)
-                    #print(ret)
+                    # print(ret)
 
             return ret
 
@@ -412,3 +414,11 @@ class UpdateAnimatedTransformConstraint(Operator):
             text.from_string(log)
             self.report({'INFO'}, "Complete report available on '%s' text datablock" % text.name)
         return {'FINISHED'}
+
+
+classes = (
+    ANIM_OT_keying_set_export,
+    BakeAction,
+    ClearUselessActions,
+    UpdateAnimatedTransformConstraint,
+)
diff --git a/release/scripts/startup/bl_operators/clip.py b/release/scripts/startup/bl_operators/clip.py
index 0c77ea2ab7e..e52d577b900 100644
--- a/release/scripts/startup/bl_operators/clip.py
+++ b/release/scripts/startup/bl_operators/clip.py
@@ -21,7 +21,10 @@ import bpy
 import os
 from bpy.types import Operator
 from bpy.props import FloatProperty
-from mathutils import Vector, Matrix
+from mathutils import (
+    Vector,
+    Matrix,
+)
 
 
 def CLIP_spaces_walk(context, all_screens, tarea, tspace, callback, *args):
@@ -1071,3 +1074,17 @@ class CLIP_OT_track_settings_to_track(bpy.types.Operator):
                     setattr(marker_selected, attr, getattr(marker, attr))
 
         return {'FINISHED'}
+
+
+classes = (
+    CLIP_OT_bundles_to_mesh,
+    CLIP_OT_constraint_to_fcurve,
+    CLIP_OT_delete_proxy,
+    CLIP_OT_filter_tracks,
+    CLIP_OT_set_active_clip,
+    CLIP_OT_set_viewport_background,
+    CLIP_OT_setup_tracking_scene,
+    CLIP_OT_track_settings_as_default,
+    CLIP_OT_track_settings_to_track,
+    CLIP_OT_track_to_empty,
+)
diff --git a/release/scripts/startup/bl_operators/console.py b/release/scripts/startup/bl_operators/console.py
index 8cfc977294a..fb36f80239e 100644
--- a/release/scripts/startup/bl_operators/console.py
+++ b/release/scripts/startup/bl_operators/console.py
@@ -159,3 +159,12 @@ class ConsoleLanguage(Operator):
                                        remove_duplicates=True)
 
         return {'FINISHED'}
+
+
+classes = (
+    ConsoleAutocomplete,
+    ConsoleBanner,
+    ConsoleCopyAsScript,
+    ConsoleExec,
+    ConsoleLanguage,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/file.py b/release/scripts/startup/bl_operators/file.py
index 51e079164b6..d710b9af715 100644
--- a/release/scripts/startup/bl_operators/file.py
+++ b/release/scripts/startup/bl_operators/file.py
@@ -248,3 +248,8 @@ class WM_OT_previews_batch_clear(Operator):
 
         return {'FINISHED'}
 
+
+classes = (
+    WM_OT_previews_batch_clear,
+    WM_OT_previews_batch_generate,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/freestyle.py b/release/scripts/startup/bl_operators/freestyle.py
index fe3b00d54cd..2e46160aeeb 100644
--- a/release/scripts/startup/bl_operators/freestyle.py
+++ b/release/scripts/startup/bl_operators/freestyle.py
@@ -16,13 +16,15 @@
 #
 # ##### END GPL LICENSE BLOCK #####
 
+# <pep8 compliant>
+
 import bpy
 
 from bpy.props import (
-        BoolProperty,
-        EnumProperty,
-        StringProperty,
-        )
+    BoolProperty,
+    EnumProperty,
+    StringProperty,
+)
 
 
 class SCENE_OT_freestyle_fill_range_by_selection(bpy.types.Operator):
@@ -218,3 +220,11 @@ class SCENE_OT_freestyle_module_open(bpy.types.Operator):
         text = bpy.data.texts.load(self.filepath, self.make_internal)
         self.freestyle_module.script = text
         return {'FINISHED'}
+
+
+classes = (
+    SCENE_OT_freestyle_add_edge_marks_to_keying_set,
+    SCENE_OT_freestyle_add_face_marks_to_keying_set,
+    SCENE_OT_freestyle_fill_range_by_selection,
+    SCENE_OT_freestyle_module_open,
+)
diff --git a/release/scripts/startup/bl_operators/image.py b/release/scripts/startup/bl_operators/image.py
index d3460383fe7..6a538f0ae33 100644
--- a/release/scripts/startup/bl_operators/image.py
+++ b/release/scripts/startup/bl_operators/image.py
@@ -242,3 +242,11 @@ class ProjectApply(Operator):
         bpy.ops.paint.project_image(image=image_name)
 
         return {'FINISHED'}
+
+
+classes = (
+    EditExternally,
+    ProjectApply,
+    ProjectEdit,
+    SaveDirty,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/mask.py b/release/scripts/startup/bl_operators/mask.py
index aa984659430..78a4bd9af27 100644
--- a/release/scripts/startup/bl_operators/mask.py
+++ b/release/scripts/startup/bl_operators/mask.py
@@ -31,3 +31,8 @@ class MASK_MT_add(Menu):
         layout.operator_context = 'INVOKE_REGION_WIN'
         layout.operator("mask.primitive_circle_add", text="Circle", icon='MESH_CIRCLE')
         layout.operator("mask.primitive_square_add", text="Square", icon='MESH_PLANE')
+
+
+classes = (
+    MASK_MT_add,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/mesh.py b/release/scripts/startup/bl_operators/mesh.py
index 58eab5436e6..4edefd7bf9b 100644
--- a/release/scripts/startup/bl_operators/mesh.py
+++ b/release/scripts/startup/bl_operators/mesh.py
@@ -21,7 +21,10 @@
 import bpy
 from bpy.types import Operator
 
-from bpy.props import EnumProperty, IntProperty
+from bpy.props import (
+    EnumProperty,
+    IntProperty,
+)
 
 
 class MeshMirrorUV(Operator):
@@ -248,3 +251,10 @@ class MehsSetNormalsFromFaces(Operator):
 
         return {'FINISHED'}
 
+
+classes = (
+    MehsSetNormalsFromFaces,
+    MeshMirrorUV,
+    MeshSelectNext,
+    MeshSelectPrev,
+)
diff --git a/release/scripts/startup/bl_operators/node.py b/release/scripts/startup/bl_operators/node.py
index acff259e503..40876e2b069 100644
--- a/release/scripts/startup/bl_operators/node.py
+++ b/release/scripts/startup/bl_operators/node.py
@@ -21,16 +21,16 @@
 import bpy
 import nodeitems_utils
 from bpy.types import (
-        Operator,
-        PropertyGroup,
-        )
+    Operator,
+    PropertyGroup,
+)
 from bpy.props import (
-        BoolProperty,
-        CollectionProperty,
-        EnumProperty,
-        IntProperty,
-        StringProperty,
-        )
+    BoolProperty,
+    CollectionProperty,
+    EnumProperty,
+    IntProperty,
+    StringProperty,
+)
 
 
 class NodeSetting(PropertyGroup):
@@ -295,3 +295,14 @@ class NODE_OT_tree_path_parent(Operator):
         space.path.pop()
 
         return {'FINISHED'}
+
+
+classes = (
+    NodeSetting,
+
+    NODE_OT_add_and_link_node,
+    NODE_OT_add_node,
+    NODE_OT_add_search,
+    NODE_OT_collapse_hide_unused_toggle,
+    NODE_OT_tree_path_parent,
+)
diff --git a/release/scripts/startup/bl_operators/object.py b/release/scripts/startup/bl_operators/object.py
index 6356da216b1..3a42d8d2e78 100644
--- a/release/scripts/startup/bl_operators/object.py
+++ b/release/scripts/startup/bl_operators/object.py
@@ -21,12 +21,12 @@
 import bpy
 from bpy.types import Operator
 from bpy.props import (
-        StringProperty,
-        BoolProperty,
-        EnumProperty,
-        IntProperty,
-        FloatProperty,
-        )
+    BoolProperty,
+    EnumProperty,
+    FloatProperty,
+    IntProperty,
+    StringProperty,
+)
 
 
 class SelectPattern(Operator):
@@ -1034,3 +1034,22 @@ class LodGenerate(Operator):
         scene.objects.active = ob
 
         return {'FINISHED'}
+
+
+classes = (
+    ClearAllRestrictRender,
+    DupliOffsetFromCursor,
+    IsolateTypeRender,
+    JoinUVs,
+    LodByName,
+    LodClearAll,
+    LodGenerate,
+    MakeDupliFace,
+    SelectCamera,
+    SelectHierarchy,
+    SelectPattern,
+    ShapeTransfer,
+    SubdivisionSet,
+    TransformsToDeltas,
+    TransformsToDeltasAnim,
+)
diff --git a/release/scripts/startup/bl_operators/object_align.py b/release/scripts/startup/bl_operators/object_align.py
index a6ee16e6b71..1539ffb3545 100644
--- a/release/scripts/startup/bl_operators/object_align.py
+++ b/release/scripts/startup/bl_operators/object_align.py
@@ -26,13 +26,14 @@ from mathutils import Vector
 def GlobalBB_LQ(bb_world):
 
     # Initialize the variables with the 8th vertex
-    left, right, front, back, down, up = (bb_world[7][0],
-                                          bb_world[7][0],
-                                          bb_world[7][1],
-                                          bb_world[7][1],
-                                          bb_world[7][2],
-                                          bb_world[7][2],
-                                          )
+    left, right, front, back, down, up = (
+        bb_world[7][0],
+        bb_world[7][0],
+        bb_world[7][1],
+        bb_world[7][1],
+        bb_world[7][2],
+        bb_world[7][2],
+    )
 
     # Test against the other 7 verts
     for i in range(7):
@@ -398,16 +399,23 @@ class AlignObjects(Operator):
 
     def execute(self, context):
         align_axis = self.align_axis
-        ret = align_objects(context,
-                            'X' in align_axis,
-                            'Y' in align_axis,
-                            'Z' in align_axis,
-                            self.align_mode,
-                            self.relative_to,
-                            self.bb_quality)
+        ret = align_objects(
+            context,
+            'X' in align_axis,
+            'Y' in align_axis,
+            'Z' in align_axis,
+            self.align_mode,
+            self.relative_to,
+            self.bb_quality,
+        )
 
         if not ret:
             self.report({'WARNING'}, "No objects with bound-box selected")
             return {'CANCELLED'}
         else:
             return {'FINISHED'}
+
+
+classes = (
+    AlignObjects,
+)
diff --git a/release/scripts/startup/bl_operators/object_quick_effects.py b/release/scripts/startup/bl_operators/object_quick_effects.py
index ef10e279bb4..16f29c77bb9 100644
--- a/release/scripts/startup/bl_operators/object_quick_effects.py
+++ b/release/scripts/startup/bl_operators/object_quick_effects.py
@@ -22,12 +22,12 @@ from mathutils import Vector
 import bpy
 from bpy.types import Operator
 from bpy.props import (
-        BoolProperty,
-        EnumProperty,
-        IntProperty,
-        FloatProperty,
-        FloatVectorProperty,
-        )
+    BoolProperty,
+    EnumProperty,
+    IntProperty,
+    FloatProperty,
+    FloatVectorProperty,
+)
 
 
 def object_ensure_material(obj, mat_name):
@@ -319,7 +319,7 @@ class QuickSmoke(Operator):
 
     def execute(self, context):
         if not bpy.app.build_options.mod_smoke:
-            self.report({'ERROR'}, "Build without Smoke modifier support")
+            self.report({'ERROR'}, "Built without Smoke modifier support")
             return {'CANCELLED'}
 
         fake_context = context.copy()
@@ -568,7 +568,7 @@ class QuickFluid(Operator):
 
     def execute(self, context):
         if not bpy.app.build_options.mod_fluid:
-            self.report({'ERROR'}, "Build without Fluid modifier support")
+            self.report({'ERROR'}, "Built without Fluid modifier support")
             return {'CANCELLED'}
 
         fake_context = context.copy()
@@ -645,3 +645,11 @@ class QuickFluid(Operator):
             bpy.ops.fluid.bake('INVOKE_DEFAULT')
 
         return {'FINISHED'}
+
+
+classes = (
+    QuickExplode,
+    QuickFluid,
+    QuickFur,
+    QuickSmoke,
+)
diff --git a/release/scripts/startup/bl_operators/object_randomize_transform.py b/release/scripts/startup/bl_operators/object_randomize_transform.py
index 38110328603..f856b85844e 100644
--- a/release/scripts/startup/bl_operators/object_randomize_transform.py
+++ b/release/scripts/startup/bl_operators/object_randomize_transform.py
@@ -185,3 +185,8 @@ class RandomizeLocRotSize(Operator):
                            loc, rot, scale, scale_even, scale_min)
 
         return {'FINISHED'}
+
+
+classes = (
+    RandomizeLocRotSize,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/presets.py b/release/scripts/startup/bl_operators/presets.py
index e01e509b292..fdacf24f6e0 100644
--- a/release/scripts/startup/bl_operators/presets.py
+++ b/release/scripts/startup/bl_operators/presets.py
@@ -135,7 +135,7 @@ class AddPresetBase:
 
                             file_preset.write("%s = %r\n" % (rna_path_step, value))
 
-                    file_preset = open(filepath, 'w')
+                    file_preset = open(filepath, 'w', encoding="utf-8")
                     file_preset.write("import bpy\n")
 
                     if hasattr(self, "preset_defines"):
@@ -680,3 +680,26 @@ class AddPresetUnitsLength(AddPresetBase, Operator):
     ]
 
     preset_subdir = "units_length"
+
+
+classes = (
+    AddPresetCamera,
+    AddPresetCloth,
+    AddPresetFluid,
+    AddPresetHairDynamics,
+    AddPresetInteraction,
+    AddPresetInterfaceTheme,
+    AddPresetKeyconfig,
+    AddPresetNodeColor,
+    AddPresetOperator,
+    AddPresetRender,
+    AddPresetSSS,
+    AddPresetSafeAreas,
+    AddPresetSunSky,
+    AddPresetTrackingCamera,
+    AddPresetTrackingSettings,
+    AddPresetTrackingTrackColor,
+    AddPresetUnitsLength,
+    ExecutePreset,
+    WM_MT_operator_presets,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/rigidbody.py b/release/scripts/startup/bl_operators/rigidbody.py
index 750a5b0bf0f..6792d525683 100644
--- a/release/scripts/startup/bl_operators/rigidbody.py
+++ b/release/scripts/startup/bl_operators/rigidbody.py
@@ -20,8 +20,10 @@
 
 import bpy
 from bpy.types import Operator
-from bpy.props import IntProperty
-from bpy.props import EnumProperty
+from bpy.props import (
+    EnumProperty,
+    IntProperty,
+)
 
 
 class CopyRigidbodySettings(Operator):
@@ -309,3 +311,10 @@ class ConnectRigidBodies(Operator):
         else:
             self.report({'WARNING'}, "No other objects selected")
             return {'CANCELLED'}
+
+
+classes = (
+    BakeToKeyframes,
+    ConnectRigidBodies,
+    CopyRigidbodySettings,
+)
diff --git a/release/scripts/startup/bl_operators/screen_play_rendered_anim.py b/release/scripts/startup/bl_operators/screen_play_rendered_anim.py
index a5565699364..f4d6c7065a9 100644
--- a/release/scripts/startup/bl_operators/screen_play_rendered_anim.py
+++ b/release/scripts/startup/bl_operators/screen_play_rendered_anim.py
@@ -180,3 +180,8 @@ class PlayRenderedAnim(Operator):
             return {'CANCELLED'}
 
         return {'FINISHED'}
+
+
+classes = (
+    PlayRenderedAnim,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/sequencer.py b/release/scripts/startup/bl_operators/sequencer.py
index 31ca4249a9d..7209b6b478f 100644
--- a/release/scripts/startup/bl_operators/sequencer.py
+++ b/release/scripts/startup/bl_operators/sequencer.py
@@ -134,3 +134,10 @@ class SequencerDeinterlaceSelectedMovies(Operator):
                 s.use_deinterlace = True
 
         return {'FINISHED'}
+
+
+classes = (
+    SequencerCrossfadeSounds,
+    SequencerCutMulticam,
+    SequencerDeinterlaceSelectedMovies,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/uvcalc_follow_active.py b/release/scripts/startup/bl_operators/uvcalc_follow_active.py
index d1ac9e0b586..25ee5cafe81 100644
--- a/release/scripts/startup/bl_operators/uvcalc_follow_active.py
+++ b/release/scripts/startup/bl_operators/uvcalc_follow_active.py
@@ -248,3 +248,8 @@ class FollowActiveQuads(Operator):
     def invoke(self, context, event):
         wm = context.window_manager
         return wm.invoke_props_dialog(self)
+
+
+classes = (
+    FollowActiveQuads,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/uvcalc_lightmap.py b/release/scripts/startup/bl_operators/uvcalc_lightmap.py
index 3b095c883a3..8ee29d15d1b 100644
--- a/release/scripts/startup/bl_operators/uvcalc_lightmap.py
+++ b/release/scripts/startup/bl_operators/uvcalc_lightmap.py
@@ -668,3 +668,8 @@ class LightMapPack(Operator):
     def invoke(self, context, event):
         wm = context.window_manager
         return wm.invoke_props_dialog(self)
+
+
+classes = (
+    LightMapPack,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/uvcalc_smart_project.py b/release/scripts/startup/bl_operators/uvcalc_smart_project.py
index 52e7b0e0ae4..5581415c083 100644
--- a/release/scripts/startup/bl_operators/uvcalc_smart_project.py
+++ b/release/scripts/startup/bl_operators/uvcalc_smart_project.py
@@ -18,7 +18,11 @@
 
 # TODO <pep8 compliant>
 
-from mathutils import Matrix, Vector, geometry
+from mathutils import (
+    Matrix,
+    Vector,
+    geometry,
+)
 import bpy
 from bpy.types import Operator
 
@@ -1100,3 +1104,8 @@ class SmartProject(Operator):
     def invoke(self, context, event):
         wm = context.window_manager
         return wm.invoke_props_dialog(self)
+
+
+classes = (
+    SmartProject,
+)
diff --git a/release/scripts/startup/bl_operators/vertexpaint_dirt.py b/release/scripts/startup/bl_operators/vertexpaint_dirt.py
index 892e1822d68..c006e8e6e92 100644
--- a/release/scripts/startup/bl_operators/vertexpaint_dirt.py
+++ b/release/scripts/startup/bl_operators/vertexpaint_dirt.py
@@ -182,3 +182,8 @@ class VertexPaintDirt(Operator):
         ret = applyVertexDirt(mesh, self.blur_iterations, self.blur_strength, self.dirt_angle, self.clean_angle, self.dirt_only)
 
         return ret
+
+
+classes = (
+    VertexPaintDirt,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/view3d.py b/release/scripts/startup/bl_operators/view3d.py
index df4a93bb87f..acec2d8fe91 100644
--- a/release/scripts/startup/bl_operators/view3d.py
+++ b/release/scripts/startup/bl_operators/view3d.py
@@ -213,3 +213,11 @@ class VIEW3D_OT_select_or_deselect_all(Operator):
                                      enumerate=self.enumerate,
                                      object=self.object,
                                      location=(x, y))
+
+
+classes = (
+    VIEW3D_OT_edit_mesh_extrude_individual_move,
+    VIEW3D_OT_edit_mesh_extrude_move,
+    VIEW3D_OT_edit_mesh_extrude_shrink_fatten,
+    VIEW3D_OT_select_or_deselect_all,
+)
+\ No newline at end of file
diff --git a/release/scripts/startup/bl_operators/wm.py b/release/scripts/startup/bl_operators/wm.py
index f5460d58d44..d94951341c1 100644
--- a/release/scripts/startup/bl_operators/wm.py
+++ b/release/scripts/startup/bl_operators/wm.py
@@ -21,12 +21,12 @@
 import bpy
 from bpy.types import Operator
 from bpy.props import (
-        StringProperty,
-        BoolProperty,
-        IntProperty,
-        FloatProperty,
-        EnumProperty,
-        )
+    StringProperty,
+    BoolProperty,
+    IntProperty,
+    FloatProperty,
+    EnumProperty,
+)
 
 from bpy.app.translations import pgettext_tip as tip_
 
@@ -130,6 +130,20 @@ def execute_context_assign(self, context):
     return operator_path_undo_return(context, data_path)
 
 
+def module_filesystem_remove(path_base, module_name):
+    import os
+    module_name = os.path.splitext(module_name)[0]
+    for f in os.listdir(path_base):
+        f_base = os.path.splitext(f)[0]
+        if f_base == module_name:
+            f_full = os.path.join(path_base, f)
+
+            if os.path.isdir(f_full):
+                os.rmdir(f_full)
+            else:
+                os.remove(f_full)
+
+
 class BRUSH_OT_active_index_set(Operator):
     """Set active sculpt/paint brush from it's number"""
     bl_idname = "brush.active_index_set"
@@ -907,7 +921,10 @@ def _wm_doc_get_id(doc_id, do_url=True, url_prefix=""):
         # an operator (common case - just button referencing an op)
         if hasattr(bpy.types, class_name.upper() + "_OT_" + class_prop):
             if do_url:
-                url = ("%s/bpy.ops.%s.html#bpy.ops.%s.%s" % (url_prefix, class_name, class_name, class_prop))
+                url = (
+                    "%s/bpy.ops.%s.html#bpy.ops.%s.%s" %
+                    (url_prefix, class_name, class_name, class_prop)
+                )
             else:
                 rna = "bpy.ops.%s.%s" % (class_name, class_prop)
         else:
@@ -922,7 +939,10 @@ def _wm_doc_get_id(doc_id, do_url=True, url_prefix=""):
                 class_name, class_prop = class_name.split("_OT_", 1)
                 class_name = class_name.lower()
                 if do_url:
-                    url = ("%s/bpy.ops.%s.html#bpy.ops.%s.%s" % (url_prefix, class_name, class_name, class_prop))
+                    url = (
+                        "%s/bpy.ops.%s.html#bpy.ops.%s.%s" %
+                        (url_prefix, class_name, class_name, class_prop)
+                    )
                 else:
                     rna = "bpy.ops.%s.%s" % (class_name, class_prop)
             else:
@@ -938,9 +958,12 @@ def _wm_doc_get_id(doc_id, do_url=True, url_prefix=""):
                         rna_parent = rna_parent.base
 
                     if do_url:
-                        url = ("%s/bpy.types.%s.html#bpy.types.%s.%s" % (url_prefix, class_name, class_name, class_prop))
+                        url = (
+                            "%s/bpy.types.%s.html#bpy.types.%s.%s" %
+                            (url_prefix, class_name, class_name, class_prop)
+                        )
                     else:
-                        rna = ("bpy.types.%s.%s" % (class_name, class_prop))
+                        rna = "bpy.types.%s.%s" % (class_name, class_prop)
                 else:
                     # We assume this is custom property, only try to generate generic url/rna_id...
                     if do_url:
@@ -1008,11 +1031,9 @@ class WM_OT_doc_view(Operator):
 
     doc_id = doc_id
     if bpy.app.version_cycle == "release":
-        _prefix = ("https://www.blender.org/api/blender_python_api_%s%s_release" %
-                   ("_".join(str(v) for v in bpy.app.version[:2]), bpy.app.version_char))
+        _prefix = ("https://docs.blender.org/api/blender_python_api_current")
     else:
-        _prefix = ("https://www.blender.org/api/blender_python_api_%s" %
-                   "_".join(str(v) for v in bpy.app.version))
+        _prefix = ("https://docs.blender.org/api/blender_python_api_master")
 
     def execute(self, context):
         url = _wm_doc_get_id(self.doc_id, do_url=True, url_prefix=self._prefix)
@@ -1089,10 +1110,10 @@ class WM_OT_properties_edit(Operator):
 
     def execute(self, context):
         from rna_prop_ui import (
-                rna_idprop_ui_prop_get,
-                rna_idprop_ui_prop_clear,
-                rna_idprop_ui_prop_update,
-                )
+            rna_idprop_ui_prop_get,
+            rna_idprop_ui_prop_clear,
+            rna_idprop_ui_prop_update,
+        )
 
         data_path = self.data_path
         value = self.value
@@ -1269,9 +1290,9 @@ class WM_OT_properties_add(Operator):
 
     def execute(self, context):
         from rna_prop_ui import (
-                rna_idprop_ui_prop_get,
-                rna_idprop_ui_prop_update,
-                )
+            rna_idprop_ui_prop_get,
+            rna_idprop_ui_prop_update,
+        )
 
         data_path = self.data_path
         item = eval("context.%s" % data_path)
@@ -1286,10 +1307,10 @@ class WM_OT_properties_add(Operator):
 
             return prop_new
 
-        prop = unique_name(
-                {*item.keys(),
-                 *type(item).bl_rna.properties.keys(),
-                 })
+        prop = unique_name({
+            *item.keys(),
+            *type(item).bl_rna.properties.keys(),
+        })
 
         item[prop] = 1.0
         rna_idprop_ui_prop_update(item, prop)
@@ -1329,9 +1350,9 @@ class WM_OT_properties_remove(Operator):
 
     def execute(self, context):
         from rna_prop_ui import (
-                rna_idprop_ui_prop_clear,
-                rna_idprop_ui_prop_update,
-                )
+            rna_idprop_ui_prop_clear,
+            rna_idprop_ui_prop_update,
+        )
         data_path = self.data_path
         item = eval("context.%s" % data_path)
         prop = self.property
@@ -1369,7 +1390,10 @@ class WM_OT_appconfig_default(Operator):
         filepath = os.path.join(bpy.utils.preset_paths("interaction")[0], "blender.py")
 
         if os.path.exists(filepath):
-            bpy.ops.script.execute_preset(filepath=filepath, menu_idname="USERPREF_MT_interaction_presets")
+            bpy.ops.script.execute_preset(
+                filepath=filepath,
+                menu_idname="USERPREF_MT_interaction_presets",
+            )
 
         return {'FINISHED'}
 
@@ -1389,7 +1413,10 @@ class WM_OT_appconfig_activate(Operator):
         filepath = self.filepath.replace("keyconfig", "interaction")
 
         if os.path.exists(filepath):
-            bpy.ops.script.execute_preset(filepath=filepath, menu_idname="USERPREF_MT_interaction_presets")
+            bpy.ops.script.execute_preset(
+                filepath=filepath,
+                menu_idname="USERPREF_MT_interaction_presets",
+            )
 
         return {'FINISHED'}
 
@@ -1494,7 +1521,7 @@ class WM_OT_blenderplayer_start(Operator):
             "-g", "show_profile", "=", "%d" % gs.show_framerate_profile,
             "-g", "show_properties", "=", "%d" % gs.show_debug_properties,
             "-g", "ignore_deprecation_warnings", "=", "%d" % (not gs.use_deprecation_warnings),
-            ])
+        ])
 
         # finish the call with the path to the blend file
         args.append(filepath)
@@ -1624,10 +1651,11 @@ class WM_OT_keyconfig_export(Operator):
 
         wm = context.window_manager
 
-        keyconfig_utils.keyconfig_export(wm,
-                                         wm.keyconfigs.active,
-                                         self.filepath,
-                                         )
+        keyconfig_utils.keyconfig_export(
+            wm,
+            wm.keyconfigs.active,
+            self.filepath,
+        )
 
         return {'FINISHED'}
 
@@ -1892,7 +1920,10 @@ class WM_OT_theme_install(Operator):
 
         try:
             shutil.copyfile(xmlfile, path_dest)
-            bpy.ops.script.execute_preset(filepath=path_dest, menu_idname="USERPREF_MT_interface_theme_presets")
+            bpy.ops.script.execute_preset(
+                filepath=path_dest,
+                menu_idname="USERPREF_MT_interface_theme_presets",
+            )
 
         except:
             traceback.print_exc()
@@ -1919,10 +1950,12 @@ class WM_OT_addon_refresh(Operator):
         return {'FINISHED'}
 
 
+# Note: shares some logic with WM_OT_app_template_install
+# but not enough to de-duplicate. Fixes here may apply to both.
 class WM_OT_addon_install(Operator):
     "Install an add-on"
     bl_idname = "wm.addon_install"
-    bl_label = "Install from File..."
+    bl_label = "Install Add-on from File..."
 
     overwrite = BoolProperty(
             name="Overwrite",
@@ -1953,20 +1986,6 @@ class WM_OT_addon_install(Operator):
             options={'HIDDEN'},
             )
 
-    @staticmethod
-    def _module_remove(path_addons, module):
-        import os
-        module = os.path.splitext(module)[0]
-        for f in os.listdir(path_addons):
-            f_base = os.path.splitext(f)[0]
-            if f_base == module:
-                f_full = os.path.join(path_addons, f)
-
-                if os.path.isdir(f_full):
-                    os.rmdir(f_full)
-                else:
-                    os.remove(f_full)
-
     def execute(self, context):
         import addon_utils
         import traceback
@@ -2019,7 +2038,7 @@ class WM_OT_addon_install(Operator):
 
             if self.overwrite:
                 for f in file_to_extract.namelist():
-                    WM_OT_addon_install._module_remove(path_addons, f)
+                    module_filesystem_remove(path_addons, f)
             else:
                 for f in file_to_extract.namelist():
                     path_dest = os.path.join(path_addons, os.path.basename(f))
@@ -2037,7 +2056,7 @@ class WM_OT_addon_install(Operator):
             path_dest = os.path.join(path_addons, os.path.basename(pyfile))
 
             if self.overwrite:
-                WM_OT_addon_install._module_remove(path_addons, os.path.basename(pyfile))
+                module_filesystem_remove(path_addons, os.path.basename(pyfile))
             elif os.path.exists(path_dest):
                 self.report({'WARNING'}, "File already installed to %r\n" % path_dest)
                 return {'CANCELLED'}
@@ -2072,7 +2091,10 @@ class WM_OT_addon_install(Operator):
         bpy.utils.refresh_script_paths()
 
         # print message
-        msg = tip_("Modules Installed from %r into %r (%s)") % (pyfile, path_addons, ", ".join(sorted(addons_new)))
+        msg = (
+            tip_("Modules Installed (%s) from %r into %r") %
+            (", ".join(sorted(addons_new)), pyfile, path_addons)
+        )
         print(msg)
         self.report({'INFO'}, msg)
 
@@ -2166,6 +2188,7 @@ class WM_OT_addon_expand(Operator):
 
         return {'FINISHED'}
 
+
 class WM_OT_addon_userpref_show(Operator):
     "Show add-on user preferences"
     bl_idname = "wm.addon_userpref_show"
@@ -2194,3 +2217,160 @@ class WM_OT_addon_userpref_show(Operator):
             bpy.ops.screen.userpref_show('INVOKE_DEFAULT')
 
         return {'FINISHED'}
+
+
+# Note: shares some logic with WM_OT_addon_install
+# but not enough to de-duplicate. Fixes here may apply to both.
+class WM_OT_app_template_install(Operator):
+    "Install an application-template"
+    bl_idname = "wm.app_template_install"
+    bl_label = "Install Template from File..."
+
+    overwrite = BoolProperty(
+            name="Overwrite",
+            description="Remove existing template with the same ID",
+            default=True,
+            )
+
+    filepath = StringProperty(
+            subtype='FILE_PATH',
+            )
+    filter_folder = BoolProperty(
+            name="Filter folders",
+            default=True,
+            options={'HIDDEN'},
+            )
+    filter_glob = StringProperty(
+            default="*.zip",
+            options={'HIDDEN'},
+            )
+
+    def execute(self, context):
+        """Extract the zip at ``self.filepath`` into the user app-templates directory."""
+        import traceback
+        import zipfile
+        import os
+
+        filepath = self.filepath
+        path_app_templates = bpy.utils.user_resource(
+            'SCRIPTS', os.path.join("startup", "bl_app_templates_user"),
+            create=True,
+        )
+
+        if not path_app_templates:
+            self.report({'ERROR'}, "Failed to get application templates path")
+            return {'CANCELLED'}
+
+        if not os.path.isdir(path_app_templates):
+            try:
+                os.makedirs(path_app_templates, exist_ok=True)
+            except Exception:  # cancel cleanly: os.listdir() below would raise otherwise
+                traceback.print_exc()
+                self.report({'ERROR'}, "Failed to create %r" % path_app_templates)
+                return {'CANCELLED'}
+
+        # Snapshot the directory so entries added by this install can be reported.
+        app_templates_old = set(os.listdir(path_app_templates))
+
+        if not zipfile.is_zipfile(filepath):
+            # Only support installing zipfiles
+            self.report({'WARNING'}, "Expected a zip-file %r\n" % filepath)
+            return {'CANCELLED'}
+
+        try:
+            file_to_extract = zipfile.ZipFile(filepath, 'r')
+        except Exception:
+            traceback.print_exc()
+            return {'CANCELLED'}
+
+        with file_to_extract:  # closes the archive on every exit path below
+            if self.overwrite:
+                for f in file_to_extract.namelist():
+                    module_filesystem_remove(path_app_templates, f)
+            else:  # refuse to install when a destination path already exists
+                for f in file_to_extract.namelist():
+                    path_dest = os.path.join(path_app_templates, os.path.basename(f))
+                    if os.path.exists(path_dest):
+                        self.report({'WARNING'}, "File already installed to %r\n" % path_dest)
+                        return {'CANCELLED'}
+
+            try:  # extract the file to "bl_app_templates_user"
+                file_to_extract.extractall(path_app_templates)
+            except Exception:
+                traceback.print_exc()
+                return {'CANCELLED'}
+
+        app_templates_new = set(os.listdir(path_app_templates)) - app_templates_old
+
+        # in case a new module path was created to install this template.
+        bpy.utils.refresh_script_paths()
+
+        msg = (
+            tip_("Template Installed (%s) from %r into %r") %
+            (", ".join(sorted(app_templates_new)), filepath, path_app_templates)
+        )
+        print(msg)
+        self.report({'INFO'}, msg)
+
+        return {'FINISHED'}
+
+    def invoke(self, context, event):
+        wm = context.window_manager
+        wm.fileselect_add(self)  # hand this operator to the window manager's file selector
+        return {'RUNNING_MODAL'}
+
+
+classes = (
+    BRUSH_OT_active_index_set,
+    WM_OT_addon_disable,
+    WM_OT_addon_enable,
+    WM_OT_addon_expand,
+    WM_OT_addon_install,
+    WM_OT_addon_refresh,
+    WM_OT_addon_remove,
+    WM_OT_addon_userpref_show,
+    WM_OT_app_template_install,
+    WM_OT_appconfig_activate,
+    WM_OT_appconfig_default,
+    WM_OT_blenderplayer_start,
+    WM_OT_context_collection_boolean_set,
+    WM_OT_context_cycle_array,
+    WM_OT_context_cycle_enum,
+    WM_OT_context_cycle_int,
+    WM_OT_context_menu_enum,
+    WM_OT_context_modal_mouse,
+    WM_OT_context_pie_enum,
+    WM_OT_context_scale_float,
+    WM_OT_context_scale_int,
+    WM_OT_context_set_boolean,
+    WM_OT_context_set_enum,
+    WM_OT_context_set_float,
+    WM_OT_context_set_id,
+    WM_OT_context_set_int,
+    WM_OT_context_set_string,
+    WM_OT_context_set_value,
+    WM_OT_context_toggle,
+    WM_OT_context_toggle_enum,
+    WM_OT_copy_prev_settings,
+    WM_OT_doc_view,
+    WM_OT_doc_view_manual,
+    WM_OT_keyconfig_activate,
+    WM_OT_keyconfig_export,
+    WM_OT_keyconfig_import,
+    WM_OT_keyconfig_remove,
+    WM_OT_keyconfig_test,
+    WM_OT_keyitem_add,
+    WM_OT_keyitem_remove,
+    WM_OT_keyitem_restore,
+    WM_OT_keymap_restore,
+    WM_OT_operator_cheat_sheet,
+    WM_OT_operator_pie_enum,
+    WM_OT_path_open,
+    WM_OT_properties_add,
+    WM_OT_properties_context_change,
+    WM_OT_properties_edit,
+    WM_OT_properties_remove,
+    WM_OT_sysinfo,
+    WM_OT_theme_install,
+    WM_OT_url_open,
+)
diff --git a/release/scripts/startup/bl_ui/__init__.py b/release/scripts/startup/bl_ui/__init__.py
index 2389be6787d..5b609605cee 100644
--- a/release/scripts/startup/bl_ui/__init__.py
+++ b/release/scripts/startup/bl_ui/__init__.py
@@ -23,8 +23,7 @@
 # support reloading sub-modules
 if "bpy" in locals():
     from importlib import reload
-    for val in _modules_loaded:
-        reload(val)
+    _modules_loaded[:] = [reload(val) for val in _modules_loaded]
     del reload
 
 _modules = [
@@ -94,7 +93,10 @@ del _namespace
 
 
 def register():
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for mod in _modules_loaded:
+        for cls in mod.classes:
+            register_class(cls)
 
     # space_userprefs.py
     from bpy.props import StringProperty, EnumProperty
@@ -143,8 +145,11 @@ def register():
 
 
 def unregister():
-    bpy.utils.unregister_module(__name__)
-
+    from bpy.utils import unregister_class
+    for mod in reversed(_modules_loaded):
+        for cls in reversed(mod.classes):
+            if cls.is_registered:
+                unregister_class(cls)
 
 # Define a default UIList, when a list does not need any custom drawing...
 # Keep in sync with its #defined name in UI_interface.h
diff --git a/release/scripts/startup/bl_ui/properties_animviz.py b/release/scripts/startup/bl_ui/properties_animviz.py
index 046b5eb2aa5..84bae18dd6f 100644
--- a/release/scripts/startup/bl_ui/properties_animviz.py
+++ b/release/scripts/startup/bl_ui/properties_animviz.py
@@ -139,6 +139,11 @@ class OnionSkinButtonsPanel:
         col.label(text="Display:")
         col.prop(arm, "show_only_ghost_selected", text="Selected Only")
 
+
+classes = (
+)
+
 if __name__ == "__main__":  # only for live edit.
-    import bpy
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_constraint.py b/release/scripts/startup/bl_ui/properties_constraint.py
index cb5f1595ff3..00892d5f85b 100644
--- a/release/scripts/startup/bl_ui/properties_constraint.py
+++ b/release/scripts/startup/bl_ui/properties_constraint.py
@@ -941,5 +941,12 @@ class BONE_PT_constraints(ConstraintButtonsPanel, Panel):
         for con in context.pose_bone.constraints:
             self.draw_constraint(context, con)
 
+classes = (
+    OBJECT_PT_constraints,
+    BONE_PT_constraints,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_armature.py b/release/scripts/startup/bl_ui/properties_data_armature.py
index 8261f0c7770..a2ecf984eb5 100644
--- a/release/scripts/startup/bl_ui/properties_data_armature.py
+++ b/release/scripts/startup/bl_ui/properties_data_armature.py
@@ -328,5 +328,21 @@ class DATA_PT_custom_props_arm(ArmatureButtonsPanel, PropertyPanel, Panel):
     _context_path = "object.data"
     _property_type = bpy.types.Armature
 
+
+classes = (
+    DATA_PT_context_arm,
+    DATA_PT_skeleton,
+    DATA_PT_display,
+    DATA_PT_bone_group_specials,
+    DATA_PT_bone_groups,
+    DATA_PT_pose_library,
+    DATA_PT_ghost,
+    DATA_PT_iksolver_itasc,
+    DATA_PT_motion_paths,
+    DATA_PT_custom_props_arm,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_bone.py b/release/scripts/startup/bl_ui/properties_data_bone.py
index 3a5475514c9..132c355ed99 100644
--- a/release/scripts/startup/bl_ui/properties_data_bone.py
+++ b/release/scripts/startup/bl_ui/properties_data_bone.py
@@ -449,5 +449,20 @@ class BONE_PT_custom_props(BoneButtonsPanel, PropertyPanel, Panel):
         else:
             return "active_bone"
 
+
+classes = (
+    BONE_PT_context_bone,
+    BONE_PT_transform,
+    BONE_PT_transform_locks,
+    BONE_PT_curved,
+    BONE_PT_relations,
+    BONE_PT_display,
+    BONE_PT_inverse_kinematics,
+    BONE_PT_deform,
+    BONE_PT_custom_props,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_camera.py b/release/scripts/startup/bl_ui/properties_data_camera.py
index c6fa77aec00..101062095c4 100644
--- a/release/scripts/startup/bl_ui/properties_data_camera.py
+++ b/release/scripts/startup/bl_ui/properties_data_camera.py
@@ -325,5 +325,20 @@ def draw_display_safe_settings(layout, safe_data, settings):
     col.prop(safe_data, "action_center", slider=True)
 
 
+classes = (
+    CAMERA_MT_presets,
+    SAFE_AREAS_MT_presets,
+    DATA_PT_context_camera,
+    DATA_PT_lens,
+    DATA_PT_camera,
+    DATA_PT_camera_stereoscopy,
+    DATA_PT_camera_dof,
+    DATA_PT_camera_display,
+    DATA_PT_camera_safe_areas,
+    DATA_PT_custom_props_camera,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_curve.py b/release/scripts/startup/bl_ui/properties_data_curve.py
index 33af812b6c4..ac1eb9505a6 100644
--- a/release/scripts/startup/bl_ui/properties_data_curve.py
+++ b/release/scripts/startup/bl_ui/properties_data_curve.py
@@ -431,5 +431,21 @@ class DATA_PT_custom_props_curve(CurveButtonsPanel, PropertyPanel, Panel):
     _context_path = "object.data"
     _property_type = bpy.types.Curve
 
+
+classes = (
+    DATA_PT_context_curve,
+    DATA_PT_shape_curve,
+    DATA_PT_curve_texture_space,
+    DATA_PT_geometry_curve,
+    DATA_PT_pathanim,
+    DATA_PT_active_spline,
+    DATA_PT_font,
+    DATA_PT_paragraph,
+    DATA_PT_text_boxes,
+    DATA_PT_custom_props_curve,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_empty.py b/release/scripts/startup/bl_ui/properties_data_empty.py
index a752d3517de..c66ff87ecbd 100644
--- a/release/scripts/startup/bl_ui/properties_data_empty.py
+++ b/release/scripts/startup/bl_ui/properties_data_empty.py
@@ -55,5 +55,12 @@ class DATA_PT_empty(DataButtonsPanel, Panel):
 
         layout.prop(ob, "empty_draw_size", text="Size")
 
+
+classes = (
+    DATA_PT_empty,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_lamp.py b/release/scripts/startup/bl_ui/properties_data_lamp.py
index 30cd39d2e2f..f913ef51381 100644
--- a/release/scripts/startup/bl_ui/properties_data_lamp.py
+++ b/release/scripts/startup/bl_ui/properties_data_lamp.py
@@ -394,5 +394,21 @@ class DATA_PT_custom_props_lamp(DataButtonsPanel, PropertyPanel, Panel):
     _context_path = "object.data"
     _property_type = bpy.types.Lamp
 
+
+classes = (
+    LAMP_MT_sunsky_presets,
+    DATA_PT_context_lamp,
+    DATA_PT_preview,
+    DATA_PT_lamp,
+    DATA_PT_sunsky,
+    DATA_PT_shadow,
+    DATA_PT_area,
+    DATA_PT_spot,
+    DATA_PT_falloff_curve,
+    DATA_PT_custom_props_lamp,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_lattice.py b/release/scripts/startup/bl_ui/properties_data_lattice.py
index cbfa6961e8c..4b3fd48c195 100644
--- a/release/scripts/startup/bl_ui/properties_data_lattice.py
+++ b/release/scripts/startup/bl_ui/properties_data_lattice.py
@@ -82,5 +82,14 @@ class DATA_PT_custom_props_lattice(DataButtonsPanel, PropertyPanel, Panel):
     _context_path = "object.data"
     _property_type = bpy.types.Lattice
 
+
+classes = (
+    DATA_PT_context_lattice,
+    DATA_PT_lattice,
+    DATA_PT_custom_props_lattice,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_mesh.py b/release/scripts/startup/bl_ui/properties_data_mesh.py
index 59907692fe0..de55b4152ba 100644
--- a/release/scripts/startup/bl_ui/properties_data_mesh.py
+++ b/release/scripts/startup/bl_ui/properties_data_mesh.py
@@ -392,5 +392,24 @@ class DATA_PT_custom_props_mesh(MeshButtonsPanel, PropertyPanel, Panel):
     _property_type = bpy.types.Mesh
 
 
+classes = (
+    MESH_MT_vertex_group_specials,
+    MESH_MT_shape_key_specials,
+    MESH_UL_vgroups,
+    MESH_UL_shape_keys,
+    MESH_UL_uvmaps_vcols,
+    DATA_PT_context_mesh,
+    DATA_PT_normals,
+    DATA_PT_texture_space,
+    DATA_PT_vertex_groups,
+    DATA_PT_shape_keys,
+    DATA_PT_uv_texture,
+    DATA_PT_vertex_colors,
+    DATA_PT_customdata,
+    DATA_PT_custom_props_mesh,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_metaball.py b/release/scripts/startup/bl_ui/properties_data_metaball.py
index b42258c386a..a621dc7210f 100644
--- a/release/scripts/startup/bl_ui/properties_data_metaball.py
+++ b/release/scripts/startup/bl_ui/properties_data_metaball.py
@@ -135,5 +135,16 @@ class DATA_PT_custom_props_metaball(DataButtonsPanel, PropertyPanel, Panel):
     _context_path = "object.data"
     _property_type = bpy.types.MetaBall
 
+
+classes = (
+    DATA_PT_context_metaball,
+    DATA_PT_metaball,
+    DATA_PT_mball_texture_space,
+    DATA_PT_metaball_element,
+    DATA_PT_custom_props_metaball,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_modifier.py b/release/scripts/startup/bl_ui/properties_data_modifier.py
index d66fb08bcd6..a37d61dd0af 100644
--- a/release/scripts/startup/bl_ui/properties_data_modifier.py
+++ b/release/scripts/startup/bl_ui/properties_data_modifier.py
@@ -569,6 +569,14 @@ class DATA_PT_modifiers(ModifierButtonsPanel, Panel):
         col.prop(md, "use_mirror_u", text="U")
         col.prop(md, "use_mirror_v", text="V")
 
+        col = layout.column(align=True)
+
+        if md.use_mirror_u:
+            col.prop(md, "mirror_offset_u")
+
+        if md.use_mirror_v:
+            col.prop(md, "mirror_offset_v")
+
         col = layout.column()
 
         if md.use_mirror_merge is True:
@@ -951,6 +959,23 @@ class DATA_PT_modifiers(ModifierButtonsPanel, Panel):
     def SURFACE(self, layout, ob, md):
         layout.label(text="Settings are inside the Physics tab")
 
+    def SURFACE_DEFORM(self, layout, ob, md):
+        """Draw the target/falloff settings, then the bind/unbind button."""
+        # Target and falloff are greyed out once the modifier reports it is bound.
+        settings = layout.column()
+        settings.active = not md.is_bound
+        settings.prop(md, "target")
+        settings.prop(md, "falloff")
+
+        layout.separator()
+
+        # One operator serves both directions; only its label follows the bound state.
+        bind_col = layout.column()
+        bind_col.active = md.target is not None
+        bind_col.operator(
+            "object.surfacedeform_bind", text="Unbind" if md.is_bound else "Bind",
+        )
+
     def UV_PROJECT(self, layout, ob, md):
         split = layout.split()
 
@@ -1320,7 +1345,9 @@ class DATA_PT_modifiers(ModifierButtonsPanel, Panel):
         row.prop(md, "thickness_vertex_group", text="Factor")
 
         col.prop(md, "use_crease", text="Crease Edges")
-        col.prop(md, "crease_weight", text="Crease Weight")
+        row = col.row()
+        row.active = md.use_crease
+        row.prop(md, "crease_weight", text="Crease Weight")
 
         col = split.column()
 
@@ -1503,5 +1530,11 @@ class DATA_PT_modifiers(ModifierButtonsPanel, Panel):
             layout.operator("object.correctivesmooth_bind", text="Unbind" if is_bind else "Bind")
 
 
+classes = (
+    DATA_PT_modifiers,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_data_speaker.py b/release/scripts/startup/bl_ui/properties_data_speaker.py
index 5ed0f8c3265..eecb2690302 100644
--- a/release/scripts/startup/bl_ui/properties_data_speaker.py
+++ b/release/scripts/startup/bl_ui/properties_data_speaker.py
@@ -121,5 +121,16 @@ class DATA_PT_custom_props_speaker(DataButtonsPanel, PropertyPanel, Panel):
     _context_path = "object.data"
     _property_type = bpy.types.Speaker
 
+
+classes = (
+    DATA_PT_context_speaker,
+    DATA_PT_speaker,
+    DATA_PT_distance,
+    DATA_PT_cone,
+    DATA_PT_custom_props_speaker,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_freestyle.py b/release/scripts/startup/bl_ui/properties_freestyle.py
index e0620447185..9c5be7624e0 100644
--- a/release/scripts/startup/bl_ui/properties_freestyle.py
+++ b/release/scripts/startup/bl_ui/properties_freestyle.py
@@ -828,5 +828,17 @@ class MATERIAL_PT_freestyle_line(MaterialFreestyleButtonsPanel, Panel):
         row.prop(mat, "line_priority", text="Priority")
 
 
+classes = (
+    RENDER_PT_freestyle,
+    RENDERLAYER_UL_linesets,
+    RENDER_MT_lineset_specials,
+    RENDERLAYER_PT_freestyle,
+    RENDERLAYER_PT_freestyle_lineset,
+    RENDERLAYER_PT_freestyle_linestyle,
+    MATERIAL_PT_freestyle_line,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_game.py b/release/scripts/startup/bl_ui/properties_game.py
index ec061370fe5..46b34373aa5 100644
--- a/release/scripts/startup/bl_ui/properties_game.py
+++ b/release/scripts/startup/bl_ui/properties_game.py
@@ -852,5 +852,30 @@ class OBJECT_PT_levels_of_detail(ObjectButtonsPanel, Panel):
         row.menu("OBJECT_MT_lod_tools", text="", icon='TRIA_DOWN')
 
 
+classes = (
+    PHYSICS_PT_game_physics,
+    PHYSICS_PT_game_collision_bounds,
+    PHYSICS_PT_game_obstacles,
+    RENDER_PT_embedded,
+    RENDER_PT_game_player,
+    RENDER_PT_game_stereo,
+    RENDER_PT_game_shading,
+    RENDER_PT_game_system,
+    RENDER_PT_game_display,
+    SCENE_PT_game_physics,
+    SCENE_PT_game_physics_obstacles,
+    SCENE_PT_game_navmesh,
+    SCENE_PT_game_hysteresis,
+    WORLD_PT_game_context_world,
+    WORLD_PT_game_world,
+    WORLD_PT_game_environment_lighting,
+    WORLD_PT_game_mist,
+    DATA_PT_shadow_game,
+    OBJECT_MT_lod_tools,
+    OBJECT_PT_levels_of_detail,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_grease_pencil_common.py b/release/scripts/startup/bl_ui/properties_grease_pencil_common.py
index 4529c127839..42e8d5272b3 100644
--- a/release/scripts/startup/bl_ui/properties_grease_pencil_common.py
+++ b/release/scripts/startup/bl_ui/properties_grease_pencil_common.py
@@ -1155,3 +1155,24 @@ class GreasePencilToolsPanel:
         layout.separator()
 
         gpencil_stroke_placement_settings(context, layout)
+
+
+classes = (
+    GPENCIL_PIE_tool_palette,
+    GPENCIL_PIE_settings_palette,
+    GPENCIL_PIE_tools_more,
+    GPENCIL_PIE_sculpt,
+    GPENCIL_MT_snap,
+    GPENCIL_MT_gpencil_edit_specials,
+    GPENCIL_UL_brush,
+    GPENCIL_UL_palettecolor,
+    GPENCIL_UL_layer,
+    GPENCIL_MT_layer_specials,
+    GPENCIL_MT_brush_specials,
+    GPENCIL_MT_palettecolor_specials,
+)
+
+if __name__ == "__main__":  # only for live edit.
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_mask_common.py b/release/scripts/startup/bl_ui/properties_mask_common.py
index f4836da50bc..a7a67130f2f 100644
--- a/release/scripts/startup/bl_ui/properties_mask_common.py
+++ b/release/scripts/startup/bl_ui/properties_mask_common.py
@@ -392,5 +392,17 @@ class MASK_MT_select(Menu):
         layout.operator("mask.select_all", text="Inverse").action = 'INVERT'
         layout.operator("mask.select_linked", text="Select Linked")
 
+
+classes = (
+    MASK_UL_layers,
+    MASK_MT_mask,
+    MASK_MT_visibility,
+    MASK_MT_transform,
+    MASK_MT_animation,
+    MASK_MT_select,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_material.py b/release/scripts/startup/bl_ui/properties_material.py
index d916007ea41..06ae1847d06 100644
--- a/release/scripts/startup/bl_ui/properties_material.py
+++ b/release/scripts/startup/bl_ui/properties_material.py
@@ -71,6 +71,7 @@ class MATERIAL_MT_specials(Menu):
 
 
 class MATERIAL_UL_matslots(UIList):
+
     def draw_item(self, context, layout, data, item, icon, active_data, active_propname, index):
         # assert(isinstance(item, bpy.types.MaterialSlot)
         # ob = data
@@ -1052,5 +1053,38 @@ class MATERIAL_PT_custom_props(MaterialButtonsPanel, PropertyPanel, Panel):
     _context_path = "material"
     _property_type = bpy.types.Material
 
+
+classes = (
+    MATERIAL_MT_sss_presets,
+    MATERIAL_MT_specials,
+    MATERIAL_UL_matslots,
+    MATERIAL_PT_context_material,
+    MATERIAL_PT_preview,
+    MATERIAL_PT_pipeline,
+    MATERIAL_PT_diffuse,
+    MATERIAL_PT_specular,
+    MATERIAL_PT_shading,
+    MATERIAL_PT_transp,
+    MATERIAL_PT_mirror,
+    MATERIAL_PT_sss,
+    MATERIAL_PT_halo,
+    MATERIAL_PT_flare,
+    MATERIAL_PT_game_settings,
+    MATERIAL_PT_physics,
+    MATERIAL_PT_strand,
+    MATERIAL_PT_options,
+    MATERIAL_PT_shadow,
+    MATERIAL_PT_transp_game,
+    MATERIAL_PT_volume_density,
+    MATERIAL_PT_volume_shading,
+    MATERIAL_PT_volume_lighting,
+    MATERIAL_PT_volume_transp,
+    MATERIAL_PT_volume_integration,
+    MATERIAL_PT_volume_options,
+    MATERIAL_PT_custom_props,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_object.py b/release/scripts/startup/bl_ui/properties_object.py
index 3ff7a248c60..d7e18f81232 100644
--- a/release/scripts/startup/bl_ui/properties_object.py
+++ b/release/scripts/startup/bl_ui/properties_object.py
@@ -152,6 +152,33 @@ class OBJECT_PT_relations(ObjectButtonsPanel, Panel):
         sub.active = (parent is not None)
 
 
+class OBJECT_PT_relations_extras(ObjectButtonsPanel, Panel):
+    bl_label = "Relations Extras"
+    bl_options = {'DEFAULT_CLOSED'}
+
+    def draw(self, context):
+        layout = self.layout
+
+        ob = context.object
+
+        split = layout.split()
+
+        if context.scene.render.engine != 'BLENDER_GAME':
+            col = split.column()
+            col.label(text="Tracking Axes:")
+            col.prop(ob, "track_axis", text="Axis")
+            col.prop(ob, "up_axis", text="Up Axis")
+
+        col = split.column()
+        col.prop(ob, "use_slow_parent")
+        row = col.row()
+        row.active = ((ob.parent is not None) and (ob.use_slow_parent))
+        row.prop(ob, "slow_parent_offset", text="Offset")
+
+        layout.prop(ob, "use_extra_recalc_object")
+        layout.prop(ob, "use_extra_recalc_data")
+
+
 class GROUP_MT_specials(Menu):
     bl_label = "Group Specials"
 
@@ -296,33 +323,6 @@ class OBJECT_PT_duplication(ObjectButtonsPanel, Panel):
             layout.prop(ob, "dupli_group", text="Group")
 
 
-class OBJECT_PT_relations_extras(ObjectButtonsPanel, Panel):
-    bl_label = "Relations Extras"
-    bl_options = {'DEFAULT_CLOSED'}
-
-    def draw(self, context):
-        layout = self.layout
-
-        ob = context.object
-
-        split = layout.split()
-
-        if context.scene.render.engine != 'BLENDER_GAME':
-            col = split.column()
-            col.label(text="Tracking Axes:")
-            col.prop(ob, "track_axis", text="Axis")
-            col.prop(ob, "up_axis", text="Up Axis")
-
-        col = split.column()
-        col.prop(ob, "use_slow_parent")
-        row = col.row()
-        row.active = ((ob.parent is not None) and (ob.use_slow_parent))
-        row.prop(ob, "slow_parent_offset", text="Offset")
-
-        layout.prop(ob, "use_extra_recalc_object")
-        layout.prop(ob, "use_extra_recalc_data")
-
-
 from bl_ui.properties_animviz import (
         MotionPathButtonsPanel,
         OnionSkinButtonsPanel,
@@ -366,5 +366,23 @@ class OBJECT_PT_custom_props(ObjectButtonsPanel, PropertyPanel, Panel):
     _context_path = "object"
     _property_type = bpy.types.Object
 
+
+classes = (
+    OBJECT_PT_context_object,
+    OBJECT_PT_transform,
+    OBJECT_PT_delta_transform,
+    OBJECT_PT_transform_locks,
+    OBJECT_PT_relations,
+    OBJECT_PT_relations_extras,
+    GROUP_MT_specials,
+    OBJECT_PT_groups,
+    OBJECT_PT_display,
+    OBJECT_PT_duplication,
+    OBJECT_PT_motion_paths,
+    OBJECT_PT_custom_props,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_paint_common.py b/release/scripts/startup/bl_ui/properties_paint_common.py
index 09a3a19cbce..dfd20d90ec4 100644
--- a/release/scripts/startup/bl_ui/properties_paint_common.py
+++ b/release/scripts/startup/bl_ui/properties_paint_common.py
@@ -309,3 +309,13 @@ def brush_mask_texture_settings(layout, brush):
     split = layout.split()
     split.prop(mask_tex_slot, "offset")
     split.prop(mask_tex_slot, "scale")
+
+
+classes = (
+    VIEW3D_MT_tools_projectpaint_clone,
+)
+
+if __name__ == "__main__":  # only for live edit.
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_particle.py b/release/scripts/startup/bl_ui/properties_particle.py
index 29fc56c3fad..2c27c68ed9d 100644
--- a/release/scripts/startup/bl_ui/properties_particle.py
+++ b/release/scripts/startup/bl_ui/properties_particle.py
@@ -23,11 +23,11 @@ from rna_prop_ui import PropertyPanel
 from bpy.app.translations import pgettext_iface as iface_
 
 from bl_ui.properties_physics_common import (
-        point_cache_ui,
-        effector_weights_ui,
-        basic_force_field_settings_ui,
-        basic_force_field_falloff_ui,
-        )
+    point_cache_ui,
+    effector_weights_ui,
+    basic_force_field_settings_ui,
+    basic_force_field_falloff_ui,
+)
 
 
 def particle_panel_enabled(context, psys):
@@ -108,6 +108,7 @@ def find_modifier(ob, psys):
 
 
 class PARTICLE_UL_particle_systems(bpy.types.UIList):
+
     def draw_item(self, context, layout, data, item, icon, active_data, active_propname, index, flt_flag):
         ob = data
         psys = item
@@ -117,8 +118,10 @@ class PARTICLE_UL_particle_systems(bpy.types.UIList):
 
             layout.prop(psys, "name", text="", emboss=False, icon_value=icon)
             if md:
-                layout.prop(md, "show_render", emboss=False, icon_only=True, icon='RESTRICT_RENDER_OFF' if md.show_render else 'RESTRICT_RENDER_ON')
-                layout.prop(md, "show_viewport", emboss=False, icon_only=True, icon='RESTRICT_VIEW_OFF' if md.show_viewport else 'RESTRICT_VIEW_ON')
+                layout.prop(md, "show_render", emboss=False, icon_only=True,
+                            icon='RESTRICT_RENDER_OFF' if md.show_render else 'RESTRICT_RENDER_ON')
+                layout.prop(md, "show_viewport", emboss=False, icon_only=True,
+                            icon='RESTRICT_VIEW_OFF' if md.show_viewport else 'RESTRICT_VIEW_ON')
 
         elif self.layout_type == 'GRID':
             layout.alignment = 'CENTER'
@@ -401,7 +404,8 @@ class PARTICLE_PT_hair_dynamics(ParticleButtonsPanel, Panel):
                 label = "ERROR"
                 icon = 'ERROR'
             box.label(label, icon=icon)
-            box.label("Iterations: %d .. %d (avg. %d)" % (result.min_iterations, result.max_iterations, result.avg_iterations))
+            box.label("Iterations: %d .. %d (avg. %d)" %
+                      (result.min_iterations, result.max_iterations, result.avg_iterations))
             box.label("Error: %.5f .. %.5f (avg. %.5f)" % (result.min_error, result.max_error, result.avg_error))
 
 
@@ -423,7 +427,12 @@ class PARTICLE_PT_cache(ParticleButtonsPanel, Panel):
         phystype = psys.settings.physics_type
         if phystype == 'NO' or phystype == 'KEYED':
             return False
-        return (psys.settings.type in {'EMITTER', 'REACTOR'} or (psys.settings.type == 'HAIR' and (psys.use_hair_dynamics or psys.point_cache.is_baked))) and engine in cls.COMPAT_ENGINES
+        return (
+            (psys.settings.type in {'EMITTER', 'REACTOR'} or
+             (psys.settings.type == 'HAIR' and
+              (psys.use_hair_dynamics or psys.point_cache.is_baked))) and
+            engine in cls.COMPAT_ENGINES
+        )
 
     def draw(self, context):
         psys = context.particle_system
@@ -743,7 +752,8 @@ class PARTICLE_PT_physics(ParticleButtonsPanel, Panel):
                 layout.label(text="Fluid interaction:")
 
             row = layout.row()
-            row.template_list("UI_UL_list", "particle_targets", psys, "targets", psys, "active_particle_target_index", rows=4)
+            row.template_list("UI_UL_list", "particle_targets", psys, "targets",
+                              psys, "active_particle_target_index", rows=4)
 
             col = row.column()
             sub = col.row()
@@ -760,7 +770,7 @@ class PARTICLE_PT_physics(ParticleButtonsPanel, Panel):
                 row = layout.row()
                 if part.physics_type == 'KEYED':
                     col = row.column()
-                    #doesn't work yet
+                    # doesn't work yet
                     #col.alert = key.valid
                     col.prop(key, "object", text="")
                     col.prop(key, "system", text="System")
@@ -770,7 +780,7 @@ class PARTICLE_PT_physics(ParticleButtonsPanel, Panel):
                     col.prop(key, "duration")
                 elif part.physics_type == 'BOIDS':
                     sub = row.row()
-                    #doesn't work yet
+                    # doesn't work yet
                     #sub.alert = key.valid
                     sub.prop(key, "object", text="")
                     sub.prop(key, "system", text="System")
@@ -778,7 +788,7 @@ class PARTICLE_PT_physics(ParticleButtonsPanel, Panel):
                     layout.prop(key, "alliance", expand=True)
                 elif part.physics_type == 'FLUID':
                     sub = row.row()
-                    #doesn't work yet
+                    # doesn't work yet
                     #sub.alert = key.valid
                     sub.prop(key, "object", text="")
                     sub.prop(key, "system", text="System")
@@ -831,7 +841,8 @@ class PARTICLE_PT_boidbrain(ParticleButtonsPanel, Panel):
             row.label(text="")
 
         row = layout.row()
-        row.template_list("UI_UL_list", "particle_boids_rules", state, "rules", state, "active_boid_rule_index", rows=4)
+        row.template_list("UI_UL_list", "particle_boids_rules", state,
+                          "rules", state, "active_boid_rule_index", rows=4)
 
         col = row.column()
         sub = col.row()
@@ -848,7 +859,7 @@ class PARTICLE_PT_boidbrain(ParticleButtonsPanel, Panel):
         if rule:
             row = layout.row()
             row.prop(rule, "name", text="")
-            #somebody make nice icons for boids here please! -jahka
+            # somebody make nice icons for boids here please! -jahka
             row.prop(rule, "use_in_air", icon='TRIA_UP', text="")
             row.prop(rule, "use_on_land", icon='TRIA_DOWN', text="")
 
@@ -1404,5 +1415,29 @@ class PARTICLE_PT_custom_props(ParticleButtonsPanel, PropertyPanel, Panel):
     _context_path = "particle_system.settings"
     _property_type = bpy.types.ParticleSettings
 
+
+classes = (
+    PARTICLE_MT_specials,
+    PARTICLE_MT_hair_dynamics_presets,
+    PARTICLE_UL_particle_systems,
+    PARTICLE_PT_context_particles,
+    PARTICLE_PT_emission,
+    PARTICLE_PT_hair_dynamics,
+    PARTICLE_PT_cache,
+    PARTICLE_PT_velocity,
+    PARTICLE_PT_rotation,
+    PARTICLE_PT_physics,
+    PARTICLE_PT_boidbrain,
+    PARTICLE_PT_render,
+    PARTICLE_PT_draw,
+    PARTICLE_PT_children,
+    PARTICLE_PT_field_weights,
+    PARTICLE_PT_force_fields,
+    PARTICLE_PT_vertexgroups,
+    PARTICLE_PT_custom_props,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_physics_cloth.py b/release/scripts/startup/bl_ui/properties_physics_cloth.py
index 3ebf2691b4c..0a7890670b2 100644
--- a/release/scripts/startup/bl_ui/properties_physics_cloth.py
+++ b/release/scripts/startup/bl_ui/properties_physics_cloth.py
@@ -256,5 +256,17 @@ class PHYSICS_PT_cloth_field_weights(PhysicButtonsPanel, Panel):
         cloth = context.cloth.settings
         effector_weights_ui(self, context, cloth.effector_weights, 'CLOTH')
 
+classes = (
+    CLOTH_MT_presets,
+    PHYSICS_PT_cloth,
+    PHYSICS_PT_cloth_cache,
+    PHYSICS_PT_cloth_collision,
+    PHYSICS_PT_cloth_stiffness,
+    PHYSICS_PT_cloth_sewing,
+    PHYSICS_PT_cloth_field_weights,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_physics_common.py b/release/scripts/startup/bl_ui/properties_physics_common.py
index 277b59d187d..0b98d8738dc 100644
--- a/release/scripts/startup/bl_ui/properties_physics_common.py
+++ b/release/scripts/startup/bl_ui/properties_physics_common.py
@@ -274,6 +274,8 @@ def basic_force_field_settings_ui(self, context, field):
         col.prop(field, "use_global_coords", text="Global")
     elif field.type == 'HARMONIC':
         col.prop(field, "use_multiple_springs")
+    if field.type == 'FORCE':
+        col.prop(field, "use_gravity_falloff",  text="Gravitation")
 
     split = layout.split()
 
@@ -316,5 +318,12 @@ def basic_force_field_falloff_ui(self, context, field):
     sub.active = field.use_max_distance
     sub.prop(field, "distance_max", text="Maximum")
 
+
+classes = (
+    PHYSICS_PT_add,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_physics_dynamicpaint.py b/release/scripts/startup/bl_ui/properties_physics_dynamicpaint.py
index 6c3a3246cf6..c813350be08 100644
--- a/release/scripts/startup/bl_ui/properties_physics_dynamicpaint.py
+++ b/release/scripts/startup/bl_ui/properties_physics_dynamicpaint.py
@@ -529,12 +529,20 @@ class PHYSICS_PT_dp_brush_wave(PhysicButtonsPanel, Panel):
             row.prop(brush, "wave_clamp")
 
 
-def register():
-    bpy.utils.register_module(__name__)
-
-
-def unregister():
-    bpy.utils.register_module(__name__)
-
-if __name__ == "__main__":
-    register()
+classes = (
+    PHYSICS_UL_dynapaint_surfaces,
+    PHYSICS_PT_dynamic_paint,
+    PHYSICS_PT_dp_advanced_canvas,
+    PHYSICS_PT_dp_canvas_output,
+    PHYSICS_PT_dp_canvas_initial_color,
+    PHYSICS_PT_dp_effects,
+    PHYSICS_PT_dp_cache,
+    PHYSICS_PT_dp_brush_source,
+    PHYSICS_PT_dp_brush_velocity,
+    PHYSICS_PT_dp_brush_wave,
+)
+
+if __name__ == "__main__":  # only for live edit.
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_physics_field.py b/release/scripts/startup/bl_ui/properties_physics_field.py
index 1cebc0496b0..2b12fcf982d 100644
--- a/release/scripts/startup/bl_ui/properties_physics_field.py
+++ b/release/scripts/startup/bl_ui/properties_physics_field.py
@@ -228,5 +228,13 @@ class PHYSICS_PT_collision(PhysicButtonsPanel, Panel):
             col.label(text="Force Fields:")
             col.prop(settings, "absorption", text="Absorption")
 
+
+classes = (
+    PHYSICS_PT_field,
+    PHYSICS_PT_collision,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_physics_fluid.py b/release/scripts/startup/bl_ui/properties_physics_fluid.py
index 5d7034c2e68..ab92370f9ae 100644
--- a/release/scripts/startup/bl_ui/properties_physics_fluid.py
+++ b/release/scripts/startup/bl_ui/properties_physics_fluid.py
@@ -308,5 +308,16 @@ class PHYSICS_PT_domain_particles(PhysicButtonsPanel, Panel):
         row.prop(fluid, "tracer_particles", text="Tracer")
         row.prop(fluid, "generate_particles", text="Generate")
 
+
+classes = (
+    FLUID_MT_presets,
+    PHYSICS_PT_fluid,
+    PHYSICS_PT_domain_gravity,
+    PHYSICS_PT_domain_boundary,
+    PHYSICS_PT_domain_particles,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_physics_rigidbody.py b/release/scripts/startup/bl_ui/properties_physics_rigidbody.py
index eeb21c046bf..6afdd800b88 100644
--- a/release/scripts/startup/bl_ui/properties_physics_rigidbody.py
+++ b/release/scripts/startup/bl_ui/properties_physics_rigidbody.py
@@ -137,5 +137,14 @@ class PHYSICS_PT_rigid_body_dynamics(PHYSICS_PT_rigidbody_panel, Panel):
         col.prop(rbo, "linear_damping", text="Translation")
         col.prop(rbo, "angular_damping", text="Rotation")
 
+
+classes = (
+    PHYSICS_PT_rigid_body,
+    PHYSICS_PT_rigid_body_collisions,
+    PHYSICS_PT_rigid_body_dynamics,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_physics_rigidbody_constraint.py b/release/scripts/startup/bl_ui/properties_physics_rigidbody_constraint.py
index 9d4f51b256b..84a4cbb4b68 100644
--- a/release/scripts/startup/bl_ui/properties_physics_rigidbody_constraint.py
+++ b/release/scripts/startup/bl_ui/properties_physics_rigidbody_constraint.py
@@ -260,5 +260,11 @@ class PHYSICS_PT_rigid_body_constraint(PHYSICS_PT_rigidbody_constraint_panel, Pa
                 sub.prop(rbc, "spring_damping_ang_z", text="Damping")
 
 
+classes = (
+    PHYSICS_PT_rigid_body_constraint,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_physics_smoke.py b/release/scripts/startup/bl_ui/properties_physics_smoke.py
index ee9135b9dbf..f2e6c1e22e3 100644
--- a/release/scripts/startup/bl_ui/properties_physics_smoke.py
+++ b/release/scripts/startup/bl_ui/properties_physics_smoke.py
@@ -21,9 +21,9 @@ import bpy
 from bpy.types import Panel
 
 from bl_ui.properties_physics_common import (
-        point_cache_ui,
-        effector_weights_ui,
-        )
+    point_cache_ui,
+    effector_weights_ui,
+)
 
 
 class PhysicButtonsPanel:
@@ -380,23 +380,23 @@ class PHYSICS_PT_smoke_display_settings(PhysicButtonsPanel, Panel):
         do_axis_slicing = (slice_method == 'AXIS_ALIGNED')
         do_full_slicing = (axis_slice_method == 'FULL')
 
-        row = layout.row();
+        row = layout.row()
         row.enabled = do_axis_slicing
         row.prop(domain, "axis_slice_method")
 
-        col = layout.column();
+        col = layout.column()
         col.enabled = not do_full_slicing and do_axis_slicing
         col.prop(domain, "slice_axis")
         col.prop(domain, "slice_depth")
 
-        row = layout.row();
+        row = layout.row()
         row.enabled = do_full_slicing or not do_axis_slicing
         row.prop(domain, "slice_per_voxel")
 
         layout.separator()
         layout.label(text="Debug:")
         layout.prop(domain, "draw_velocity")
-        col = layout.column();
+        col = layout.column()
         col.enabled = domain.draw_velocity
         col.prop(domain, "vector_draw_type")
         col.prop(domain, "vector_scale")
@@ -404,11 +404,25 @@ class PHYSICS_PT_smoke_display_settings(PhysicButtonsPanel, Panel):
         layout.separator()
         layout.label(text="Color Mapping:")
         layout.prop(domain, "use_color_ramp")
-        col = layout.column();
+        col = layout.column()
         col.enabled = domain.use_color_ramp
         col.prop(domain, "coba_field")
         col.template_color_ramp(domain, "color_ramp", expand=True)
 
 
+classes = (
+    PHYSICS_PT_smoke,
+    PHYSICS_PT_smoke_flow_advanced,
+    PHYSICS_PT_smoke_fire,
+    PHYSICS_PT_smoke_adaptive_domain,
+    PHYSICS_PT_smoke_highres,
+    PHYSICS_PT_smoke_groups,
+    PHYSICS_PT_smoke_cache,
+    PHYSICS_PT_smoke_field_weights,
+    PHYSICS_PT_smoke_display_settings,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_physics_softbody.py b/release/scripts/startup/bl_ui/properties_physics_softbody.py
index a458af739f2..5960428e4ae 100644
--- a/release/scripts/startup/bl_ui/properties_physics_softbody.py
+++ b/release/scripts/startup/bl_ui/properties_physics_softbody.py
@@ -244,5 +244,18 @@ class PHYSICS_PT_softbody_field_weights(PhysicButtonsPanel, Panel):
 
         effector_weights_ui(self, context, softbody.effector_weights, 'SOFTBODY')
 
+
+classes = (
+    PHYSICS_PT_softbody,
+    PHYSICS_PT_softbody_cache,
+    PHYSICS_PT_softbody_goal,
+    PHYSICS_PT_softbody_edge,
+    PHYSICS_PT_softbody_collision,
+    PHYSICS_PT_softbody_solver,
+    PHYSICS_PT_softbody_field_weights,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_render.py b/release/scripts/startup/bl_ui/properties_render.py
index 6d23c07152e..a7e8d9273ad 100644
--- a/release/scripts/startup/bl_ui/properties_render.py
+++ b/release/scripts/startup/bl_ui/properties_render.py
@@ -584,5 +584,24 @@ class RENDER_PT_bake(RenderButtonsPanel, Panel):
             sub.prop(rd, "bake_user_scale", text="User Scale")
 
 
+classes = (
+    RENDER_MT_presets,
+    RENDER_MT_ffmpeg_presets,
+    RENDER_MT_framerate_presets,
+    RENDER_PT_render,
+    RENDER_PT_dimensions,
+    RENDER_PT_antialiasing,
+    RENDER_PT_motion_blur,
+    RENDER_PT_shading,
+    RENDER_PT_performance,
+    RENDER_PT_post_processing,
+    RENDER_PT_stamp,
+    RENDER_PT_output,
+    RENDER_PT_encoding,
+    RENDER_PT_bake,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_render_layer.py b/release/scripts/startup/bl_ui/properties_render_layer.py
index 9b8bc237db9..084bc387822 100644
--- a/release/scripts/startup/bl_ui/properties_render_layer.py
+++ b/release/scripts/startup/bl_ui/properties_render_layer.py
@@ -227,5 +227,16 @@ class RENDERLAYER_PT_views(RenderLayerButtonsPanel, Panel):
             row.prop(rv, "camera_suffix", text="")
 
 
+classes = (
+    RENDERLAYER_UL_renderlayers,
+    RENDERLAYER_PT_layers,
+    RENDERLAYER_PT_layer_options,
+    RENDERLAYER_PT_layer_passes,
+    RENDERLAYER_UL_renderviews,
+    RENDERLAYER_PT_views,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_scene.py b/release/scripts/startup/bl_ui/properties_scene.py
index d6253ec7fbc..fb93c0b56f2 100644
--- a/release/scripts/startup/bl_ui/properties_scene.py
+++ b/release/scripts/startup/bl_ui/properties_scene.py
@@ -443,5 +443,25 @@ class SCENE_PT_custom_props(SceneButtonsPanel, PropertyPanel, Panel):
     _context_path = "scene"
     _property_type = bpy.types.Scene
 
+
+classes = (
+    SCENE_MT_units_length_presets,
+    SCENE_UL_keying_set_paths,
+    SCENE_PT_scene,
+    SCENE_PT_unit,
+    SCENE_PT_keying_sets,
+    SCENE_PT_keying_set_paths,
+    SCENE_PT_color_management,
+    SCENE_PT_audio,
+    SCENE_PT_physics,
+    SCENE_PT_rigid_body_world,
+    SCENE_PT_rigid_body_cache,
+    SCENE_PT_rigid_body_field_weights,
+    SCENE_PT_simplify,
+    SCENE_PT_custom_props,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_texture.py b/release/scripts/startup/bl_ui/properties_texture.py
index caf19a9e469..d05527b7ef6 100644
--- a/release/scripts/startup/bl_ui/properties_texture.py
+++ b/release/scripts/startup/bl_ui/properties_texture.py
@@ -21,15 +21,15 @@ import bpy
 from bpy.types import Menu, Panel, UIList
 
 from bpy.types import (
-        Brush,
-        FreestyleLineStyle,
-        Lamp,
-        Material,
-        Object,
-        ParticleSettings,
-        Texture,
-        World,
-        )
+    Brush,
+    FreestyleLineStyle,
+    Lamp,
+    Material,
+    Object,
+    ParticleSettings,
+    Texture,
+    World,
+)
 
 from rna_prop_ui import PropertyPanel
 
@@ -60,6 +60,7 @@ class TEXTURE_MT_envmap_specials(Menu):
 
 
 class TEXTURE_UL_texslots(UIList):
+
     def draw_item(self, context, layout, data, item, icon, active_data, active_propname, index):
         # assert(isinstance(item, bpy.types.MaterialTextureSlot)
         ma = data
@@ -198,7 +199,8 @@ class TEXTURE_PT_context_texture(TextureButtonsPanel, Panel):
         if tex_collection:
             row = layout.row()
 
-            row.template_list("TEXTURE_UL_texslots", "", idblock, "texture_slots", idblock, "active_texture_index", rows=2)
+            row.template_list("TEXTURE_UL_texslots", "", idblock, "texture_slots",
+                              idblock, "active_texture_index", rows=2)
 
             col = row.column(align=True)
             col.operator("texture.slot_move", text="", icon='TRIA_UP').type = 'UP'
@@ -1210,12 +1212,22 @@ class TEXTURE_PT_influence(TextureSlotPanel, Panel):
             row = layout.row()
 
             sub = row.row()
-            sub.active = (tex.use_map_normal or tex.use_map_warp) and not (tex.texture.type == 'IMAGE' and (tex.texture.use_normal_map or tex.texture.use_derivative_map))
+            sub.active = (
+                (tex.use_map_normal or tex.use_map_warp) and
+                not (tex.texture.type == 'IMAGE' and
+                     (tex.texture.use_normal_map or tex.texture.use_derivative_map))
+            )
             sub.prop(tex, "bump_method", text="Method")
 
-            # the space setting is supported for: derivative-maps + bump-maps (DEFAULT,BEST_QUALITY), not for normal-maps
+            # the space setting is supported for: derivative-maps + bump-maps
+            # (DEFAULT,BEST_QUALITY), not for normal-maps
             sub = row.row()
-            sub.active = (tex.use_map_normal or tex.use_map_warp) and not (tex.texture.type == 'IMAGE' and tex.texture.use_normal_map) and ((tex.bump_method in {'BUMP_LOW_QUALITY', 'BUMP_MEDIUM_QUALITY', 'BUMP_BEST_QUALITY'}) or (tex.texture.type == 'IMAGE' and tex.texture.use_derivative_map))
+            sub.active = (
+                (tex.use_map_normal or tex.use_map_warp) and
+                not (tex.texture.type == 'IMAGE' and tex.texture.use_normal_map) and
+                ((tex.bump_method in {'BUMP_LOW_QUALITY', 'BUMP_MEDIUM_QUALITY', 'BUMP_BEST_QUALITY'}) or
+                 (tex.texture.type == 'IMAGE' and tex.texture.use_derivative_map))
+            )
             sub.prop(tex, "bump_objectspace", text="Space")
 
 
@@ -1224,5 +1236,38 @@ class TEXTURE_PT_custom_props(TextureButtonsPanel, PropertyPanel, Panel):
     _context_path = "texture"
     _property_type = Texture
 
+
+classes = (
+    TEXTURE_MT_specials,
+    TEXTURE_MT_envmap_specials,
+    TEXTURE_UL_texslots,
+    TEXTURE_PT_context_texture,
+    TEXTURE_PT_preview,
+    TEXTURE_PT_colors,
+    TEXTURE_PT_clouds,
+    TEXTURE_PT_wood,
+    TEXTURE_PT_marble,
+    TEXTURE_PT_magic,
+    TEXTURE_PT_blend,
+    TEXTURE_PT_stucci,
+    TEXTURE_PT_image,
+    TEXTURE_PT_image_sampling,
+    TEXTURE_PT_image_mapping,
+    TEXTURE_PT_envmap,
+    TEXTURE_PT_envmap_sampling,
+    TEXTURE_PT_musgrave,
+    TEXTURE_PT_voronoi,
+    TEXTURE_PT_distortednoise,
+    TEXTURE_PT_voxeldata,
+    TEXTURE_PT_pointdensity,
+    TEXTURE_PT_pointdensity_turbulence,
+    TEXTURE_PT_ocean,
+    TEXTURE_PT_mapping,
+    TEXTURE_PT_influence,
+    TEXTURE_PT_custom_props,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/properties_world.py b/release/scripts/startup/bl_ui/properties_world.py
index e07349a99ee..6aa39580d34 100644
--- a/release/scripts/startup/bl_ui/properties_world.py
+++ b/release/scripts/startup/bl_ui/properties_world.py
@@ -248,5 +248,20 @@ class WORLD_PT_custom_props(WorldButtonsPanel, PropertyPanel, Panel):
     _context_path = "world"
     _property_type = bpy.types.World
 
+
+classes = (
+    WORLD_PT_context_world,
+    WORLD_PT_preview,
+    WORLD_PT_world,
+    WORLD_PT_ambient_occlusion,
+    WORLD_PT_environment_lighting,
+    WORLD_PT_indirect_lighting,
+    WORLD_PT_gather,
+    WORLD_PT_mist,
+    WORLD_PT_custom_props,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_clip.py b/release/scripts/startup/bl_ui/space_clip.py
index 5cc135392c1..3f05620fcf6 100644
--- a/release/scripts/startup/bl_ui/space_clip.py
+++ b/release/scripts/startup/bl_ui/space_clip.py
@@ -1486,5 +1486,69 @@ class CLIP_MT_stabilize_2d_rotation_specials(Menu):
         layout.operator("clip.stabilize_2d_rotation_select")
 
 
+classes = (
+    CLIP_UL_tracking_objects,
+    CLIP_HT_header,
+    CLIP_MT_track,
+    CLIP_MT_tracking_editor_menus,
+    CLIP_MT_masking_editor_menus,
+    CLIP_PT_track,
+    CLIP_PT_tools_clip,
+    CLIP_PT_tools_marker,
+    CLIP_PT_tracking_settings,
+    CLIP_PT_tools_tracking,
+    CLIP_PT_tools_plane_tracking,
+    CLIP_PT_tools_solve,
+    CLIP_PT_tools_cleanup,
+    CLIP_PT_tools_geometry,
+    CLIP_PT_tools_orientation,
+    CLIP_PT_tools_object,
+    CLIP_PT_objects,
+    CLIP_PT_plane_track,
+    CLIP_PT_track_settings,
+    CLIP_PT_tracking_camera,
+    CLIP_PT_tracking_lens,
+    CLIP_PT_display,
+    CLIP_PT_marker,
+    CLIP_PT_marker_display,
+    CLIP_PT_stabilization,
+    CLIP_PT_proxy,
+    CLIP_PT_mask,
+    CLIP_PT_mask_layers,
+    CLIP_PT_mask_display,
+    CLIP_PT_active_mask_spline,
+    CLIP_PT_active_mask_point,
+    CLIP_PT_tools_mask,
+    CLIP_PT_tools_mask_add,
+    CLIP_PT_tools_mask_transforms,
+    CLIP_PT_footage,
+    CLIP_PT_footage_info,
+    CLIP_PT_tools_scenesetup,
+    CLIP_PT_grease_pencil,
+    CLIP_PT_grease_pencil_palettecolor,
+    CLIP_PT_tools_grease_pencil_draw,
+    CLIP_PT_tools_grease_pencil_edit,
+    CLIP_PT_tools_grease_pencil_sculpt,
+    CLIP_PT_tools_grease_pencil_brush,
+    CLIP_PT_tools_grease_pencil_brushcurves,
+    CLIP_MT_view,
+    CLIP_MT_clip,
+    CLIP_MT_proxy,
+    CLIP_MT_reconstruction,
+    CLIP_MT_track_visibility,
+    CLIP_MT_track_transform,
+    CLIP_MT_select,
+    CLIP_MT_select_grouped,
+    CLIP_MT_tracking_specials,
+    CLIP_MT_camera_presets,
+    CLIP_MT_track_color_presets,
+    CLIP_MT_tracking_settings_presets,
+    CLIP_MT_track_color_specials,
+    CLIP_MT_stabilize_2d_specials,
+    CLIP_MT_stabilize_2d_rotation_specials,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_console.py b/release/scripts/startup/bl_ui/space_console.py
index 4266abc5ad7..e6f96d989c7 100644
--- a/release/scripts/startup/bl_ui/space_console.py
+++ b/release/scripts/startup/bl_ui/space_console.py
@@ -102,5 +102,15 @@ def add_scrollback(text, text_type):
         bpy.ops.console.scrollback_append(text=l.expandtabs(4),
                                           type=text_type)
 
+
+classes = (
+    CONSOLE_HT_header,
+    CONSOLE_MT_editor_menus,
+    CONSOLE_MT_console,
+    CONSOLE_MT_language,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_dopesheet.py b/release/scripts/startup/bl_ui/space_dopesheet.py
index 4d365c8dc08..e02649219dd 100644
--- a/release/scripts/startup/bl_ui/space_dopesheet.py
+++ b/release/scripts/startup/bl_ui/space_dopesheet.py
@@ -449,5 +449,21 @@ class DOPESHEET_MT_delete(Menu):
         layout.operator("action.clean", text="Clean Channels").channels = True
 
 
+classes = (
+    DOPESHEET_HT_header,
+    DOPESHEET_MT_editor_menus,
+    DOPESHEET_MT_view,
+    DOPESHEET_MT_select,
+    DOPESHEET_MT_marker,
+    DOPESHEET_MT_channel,
+    DOPESHEET_MT_key,
+    DOPESHEET_MT_key_transform,
+    DOPESHEET_MT_gpencil_channel,
+    DOPESHEET_MT_gpencil_frame,
+    DOPESHEET_MT_delete,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_filebrowser.py b/release/scripts/startup/bl_ui/space_filebrowser.py
index d7b93dc36f0..bd72dfa9370 100644
--- a/release/scripts/startup/bl_ui/space_filebrowser.py
+++ b/release/scripts/startup/bl_ui/space_filebrowser.py
@@ -247,5 +247,18 @@ class FILEBROWSER_PT_advanced_filter(Panel):
                 col.prop(params, "filter_id")
 
 
+classes = (
+    FILEBROWSER_HT_header,
+    FILEBROWSER_UL_dir,
+    FILEBROWSER_PT_system_folders,
+    FILEBROWSER_PT_system_bookmarks,
+    FILEBROWSER_MT_bookmarks_specials,
+    FILEBROWSER_PT_bookmarks,
+    FILEBROWSER_PT_recent_folders,
+    FILEBROWSER_PT_advanced_filter,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_graph.py b/release/scripts/startup/bl_ui/space_graph.py
index 446df9e6e79..21b49df61ae 100644
--- a/release/scripts/startup/bl_ui/space_graph.py
+++ b/release/scripts/startup/bl_ui/space_graph.py
@@ -42,10 +42,11 @@ class GRAPH_HT_header(Header):
 
         dopesheet_filter(layout, context)
 
-        layout.prop(st, "use_normalization", text="Normalize")
-        row = layout.row()
-        row.active = st.use_normalization
-        row.prop(st, "use_auto_normalization", text="Auto")
+        row = layout.row(align=True)
+        row.prop(st, "use_normalization", icon='NORMALIZE_FCURVES', text="Normalize", toggle=True)
+        sub = row.row(align=True)
+        sub.active = st.use_normalization
+        sub.prop(st, "use_auto_normalization", icon='FILE_REFRESH', text="", toggle=True)
 
         row = layout.row(align=True)
 
@@ -298,6 +299,19 @@ class GRAPH_MT_delete(Menu):
         layout.operator("graph.clean").channels = False
         layout.operator("graph.clean", text="Clean Channels").channels = True
 
+classes = (
+    GRAPH_HT_header,
+    GRAPH_MT_editor_menus,
+    GRAPH_MT_view,
+    GRAPH_MT_select,
+    GRAPH_MT_marker,
+    GRAPH_MT_channel,
+    GRAPH_MT_key,
+    GRAPH_MT_key_transform,
+    GRAPH_MT_delete,
+)
 
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_image.py b/release/scripts/startup/bl_ui/space_image.py
index 04b4cef9512..b274657b4f4 100644
--- a/release/scripts/startup/bl_ui/space_image.py
+++ b/release/scripts/startup/bl_ui/space_image.py
@@ -1242,5 +1242,59 @@ class IMAGE_PT_tools_grease_pencil_brushcurves(GreasePencilBrushCurvesPanel, Pan
     bl_space_type = 'IMAGE_EDITOR'
 
 
+classes = (
+    IMAGE_MT_view,
+    IMAGE_MT_select,
+    IMAGE_MT_brush,
+    IMAGE_MT_image,
+    IMAGE_MT_image_invert,
+    IMAGE_MT_uvs,
+    IMAGE_MT_uvs_showhide,
+    IMAGE_MT_uvs_proportional,
+    IMAGE_MT_uvs_transform,
+    IMAGE_MT_uvs_snap,
+    IMAGE_MT_uvs_mirror,
+    IMAGE_MT_uvs_weldalign,
+    IMAGE_MT_uvs_select_mode,
+    IMAGE_HT_header,
+    MASK_MT_editor_menus,
+    IMAGE_PT_mask,
+    IMAGE_PT_mask_layers,
+    IMAGE_PT_mask_display,
+    IMAGE_PT_active_mask_spline,
+    IMAGE_PT_active_mask_point,
+    IMAGE_PT_image_properties,
+    IMAGE_PT_game_properties,
+    IMAGE_PT_view_properties,
+    IMAGE_PT_tools_transform_uvs,
+    IMAGE_PT_paint,
+    IMAGE_PT_tools_brush_overlay,
+    IMAGE_PT_tools_brush_texture,
+    IMAGE_PT_tools_mask,
+    IMAGE_PT_tools_mask_texture,
+    IMAGE_PT_tools_brush_tool,
+    IMAGE_PT_paint_stroke,
+    IMAGE_PT_paint_curve,
+    IMAGE_PT_tools_imagepaint_symmetry,
+    IMAGE_PT_tools_brush_appearance,
+    IMAGE_PT_tools_paint_options,
+    IMAGE_UV_sculpt,
+    IMAGE_UV_sculpt_curve,
+    IMAGE_PT_view_histogram,
+    IMAGE_PT_view_waveform,
+    IMAGE_PT_view_vectorscope,
+    IMAGE_PT_sample_line,
+    IMAGE_PT_scope_sample,
+    IMAGE_PT_grease_pencil,
+    IMAGE_PT_grease_pencil_palettecolor,
+    IMAGE_PT_tools_grease_pencil_draw,
+    IMAGE_PT_tools_grease_pencil_edit,
+    IMAGE_PT_tools_grease_pencil_sculpt,
+    IMAGE_PT_tools_grease_pencil_brush,
+    IMAGE_PT_tools_grease_pencil_brushcurves,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_info.py b/release/scripts/startup/bl_ui/space_info.py
index 2a085b8aada..a7b518dfd2e 100644
--- a/release/scripts/startup/bl_ui/space_info.py
+++ b/release/scripts/startup/bl_ui/space_info.py
@@ -127,6 +127,18 @@ class INFO_MT_file(Menu):
         layout.operator("wm.save_homefile", icon='SAVE_PREFS')
         layout.operator("wm.read_factory_settings", icon='LOAD_FACTORY')
 
+        if any(bpy.utils.app_template_paths()):
+            app_template = context.user_preferences.app_template
+            if app_template:
+                layout.operator(
+                    "wm.read_factory_settings",
+                    text="Load Factory Template Settings",
+                    icon='LOAD_FACTORY',
+                ).app_template = app_template
+            del app_template
+
+        layout.menu("USERPREF_MT_app_templates", icon='FILE_BLEND')
+
         layout.separator()
 
         layout.operator_context = 'INVOKE_AREA'
@@ -340,5 +352,23 @@ class INFO_MT_help(Menu):
 
         layout.operator("wm.splash", icon='BLENDER')
 
+
+classes = (
+    INFO_HT_header,
+    INFO_MT_editor_menus,
+    INFO_MT_file,
+    INFO_MT_file_import,
+    INFO_MT_file_export,
+    INFO_MT_file_external_data,
+    INFO_MT_file_previews,
+    INFO_MT_game,
+    INFO_MT_render,
+    INFO_MT_opengl_render,
+    INFO_MT_window,
+    INFO_MT_help,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_logic.py b/release/scripts/startup/bl_ui/space_logic.py
index 1b316a3eede..b552181f491 100644
--- a/release/scripts/startup/bl_ui/space_logic.py
+++ b/release/scripts/startup/bl_ui/space_logic.py
@@ -130,5 +130,16 @@ class LOGIC_MT_view(Menu):
         layout.operator("screen.screen_full_area")
         layout.operator("screen.screen_full_area", text="Toggle Fullscreen Area").use_hide_panels = True
 
+
+classes = (
+    LOGIC_PT_properties,
+    LOGIC_MT_logicbricks_add,
+    LOGIC_HT_header,
+    LOGIC_MT_editor_menus,
+    LOGIC_MT_view,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_nla.py b/release/scripts/startup/bl_ui/space_nla.py
index 8fbf9bfc6ac..65d020477dc 100644
--- a/release/scripts/startup/bl_ui/space_nla.py
+++ b/release/scripts/startup/bl_ui/space_nla.py
@@ -209,5 +209,19 @@ class NLA_MT_edit_transform(Menu):
         layout.operator("transform.transform", text="Extend").mode = 'TIME_EXTEND'
         layout.operator("transform.transform", text="Scale").mode = 'TIME_SCALE'
 
+
+classes = (
+    NLA_HT_header,
+    NLA_MT_edit,
+    NLA_MT_editor_menus,
+    NLA_MT_view,
+    NLA_MT_select,
+    NLA_MT_marker,
+    NLA_MT_add,
+    NLA_MT_edit_transform,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_node.py b/release/scripts/startup/bl_ui/space_node.py
index 8821fa0ca58..b939d34e154 100644
--- a/release/scripts/startup/bl_ui/space_node.py
+++ b/release/scripts/startup/bl_ui/space_node.py
@@ -526,6 +526,33 @@ class NODE_PT_tools_grease_pencil_brushcurves(GreasePencilBrushCurvesPanel, Pane
 def node_draw_tree_view(layout, context):
     pass
 
+classes = (
+    NODE_HT_header,
+    NODE_MT_editor_menus,
+    NODE_MT_add,
+    NODE_MT_view,
+    NODE_MT_select,
+    NODE_MT_node,
+    NODE_MT_node_color_presets,
+    NODE_MT_node_color_specials,
+    NODE_PT_active_node_generic,
+    NODE_PT_active_node_color,
+    NODE_PT_active_node_properties,
+    NODE_PT_backdrop,
+    NODE_PT_quality,
+    NODE_UL_interface_sockets,
+    NODE_PT_grease_pencil,
+    NODE_PT_grease_pencil_palettecolor,
+    NODE_PT_grease_pencil_tools,
+    NODE_PT_tools_grease_pencil_draw,
+    NODE_PT_tools_grease_pencil_edit,
+    NODE_PT_tools_grease_pencil_sculpt,
+    NODE_PT_tools_grease_pencil_brush,
+    NODE_PT_tools_grease_pencil_brushcurves,
+)
+
 
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_outliner.py b/release/scripts/startup/bl_ui/space_outliner.py
index 6f7c535fe21..4eb550bbb35 100644
--- a/release/scripts/startup/bl_ui/space_outliner.py
+++ b/release/scripts/startup/bl_ui/space_outliner.py
@@ -130,5 +130,16 @@ class OUTLINER_MT_edit_datablocks(Menu):
         layout.operator("outliner.drivers_add_selected")
         layout.operator("outliner.drivers_delete_selected")
 
+
+classes = (
+    OUTLINER_HT_header,
+    OUTLINER_MT_editor_menus,
+    OUTLINER_MT_view,
+    OUTLINER_MT_search,
+    OUTLINER_MT_edit_datablocks,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_properties.py b/release/scripts/startup/bl_ui/space_properties.py
index f0ca01ec3ba..20cb5719e20 100644
--- a/release/scripts/startup/bl_ui/space_properties.py
+++ b/release/scripts/startup/bl_ui/space_properties.py
@@ -34,13 +34,11 @@ class PROPERTIES_HT_header(Header):
         row.prop(view, "context", expand=True, icon_only=True)
 
 
-def register():
-    bpy.utils.register_module(__name__)
-
-
-def unregister():
-    bpy.utils.unregister_module(__name__)
-
-
-if __name__ == "__main__":
-    register()
+classes = (
+    PROPERTIES_HT_header,
+)
+
+if __name__ == "__main__":  # only for live edit.
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_sequencer.py b/release/scripts/startup/bl_ui/space_sequencer.py
index 6bb516cf929..01da3b48f7d 100644
--- a/release/scripts/startup/bl_ui/space_sequencer.py
+++ b/release/scripts/startup/bl_ui/space_sequencer.py
@@ -652,17 +652,39 @@ class SEQUENCER_PT_effect(SequencerButtonsPanel, Panel):
             col.prop(strip, "rotation_start", text="Rotation")
 
         elif strip.type == 'MULTICAM':
-            layout.prop(strip, "multicam_source")
-
-            row = layout.row(align=True)
-            sub = row.row(align=True)
-            sub.scale_x = 2.0
-
-            sub.operator("screen.animation_play", text="", icon='PAUSE' if context.screen.is_animation_playing else 'PLAY')
+            col = layout.column(align=True)
+            strip_channel = strip.channel
+
+            col.prop(strip, "multicam_source", text="Source Channel")
+
+            # The multicam strip needs at least 2 strips to be useful
+            if strip_channel > 2:
+                BT_ROW = 4
+
+                col.label("Cut To:")
+                row = col.row()
+
+                for i in range(1, strip_channel):
+                    if (i % BT_ROW) == 1:
+                        row = col.row(align=True)
+
+                    # Workaround - .enabled has to have a separate UI block to work
+                    if i == strip.multicam_source:
+                        sub = row.row(align=True)
+                        sub.enabled = False
+                        sub.operator("sequencer.cut_multicam", text="%d" % i).camera = i
+                    else:
+                        sub_1 = row.row(align=True)
+                        sub_1.enabled = True
+                        sub_1.operator("sequencer.cut_multicam", text="%d" % i).camera = i
+
+                if strip.channel > BT_ROW and (strip_channel - 1) % BT_ROW:
+                    for i in range(strip.channel, strip_channel + ((BT_ROW + 1 - strip_channel) % BT_ROW)):
+                        row.label("")
+            else:
+                col.separator()
+                col.label(text="Two or more channels are needed below this strip", icon="INFO")
 
-            row.label("Cut To")
-            for i in range(1, strip.channel):
-                row.operator("sequencer.cut_multicam", text="%d" % i).camera = i
 
         elif strip.type == 'TEXT':
             col = layout.column()
@@ -1217,5 +1239,37 @@ class SEQUENCER_PT_custom_props(SequencerButtonsPanel, PropertyPanel, Panel):
     bl_category = "Strip"
 
 
+classes = (
+    SEQUENCER_HT_header,
+    SEQUENCER_MT_editor_menus,
+    SEQUENCER_MT_view,
+    SEQUENCER_MT_view_toggle,
+    SEQUENCER_MT_select,
+    SEQUENCER_MT_marker,
+    SEQUENCER_MT_change,
+    SEQUENCER_MT_frame,
+    SEQUENCER_MT_add,
+    SEQUENCER_MT_add_effect,
+    SEQUENCER_MT_strip,
+    SEQUENCER_PT_edit,
+    SEQUENCER_PT_effect,
+    SEQUENCER_PT_input,
+    SEQUENCER_PT_sound,
+    SEQUENCER_PT_scene,
+    SEQUENCER_PT_mask,
+    SEQUENCER_PT_filter,
+    SEQUENCER_PT_proxy,
+    SEQUENCER_PT_preview,
+    SEQUENCER_PT_view,
+    SEQUENCER_PT_view_safe_areas,
+    SEQUENCER_PT_modifiers,
+    SEQUENCER_PT_grease_pencil,
+    SEQUENCER_PT_grease_pencil_palettecolor,
+    SEQUENCER_PT_grease_pencil_tools,
+    SEQUENCER_PT_custom_props,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_text.py b/release/scripts/startup/bl_ui/space_text.py
index 1fd10575e07..98a7da855ca 100644
--- a/release/scripts/startup/bl_ui/space_text.py
+++ b/release/scripts/startup/bl_ui/space_text.py
@@ -215,20 +215,22 @@ class TEXT_MT_templates_py(Menu):
     bl_label = "Python"
 
     def draw(self, context):
-        self.path_menu(bpy.utils.script_paths("templates_py"),
-                       "text.open",
-                       {"internal": True},
-                       )
+        self.path_menu(
+            bpy.utils.script_paths("templates_py"),
+            "text.open",
+            props_default={"internal": True},
+        )
 
 
 class TEXT_MT_templates_osl(Menu):
     bl_label = "Open Shading Language"
 
     def draw(self, context):
-        self.path_menu(bpy.utils.script_paths("templates_osl"),
-                       "text.open",
-                       {"internal": True},
-                       )
+        self.path_menu(
+            bpy.utils.script_paths("templates_osl"),
+            "text.open",
+            props_default={"internal": True},
+        )
 
 
 class TEXT_MT_templates(Menu):
@@ -341,5 +343,25 @@ class TEXT_MT_toolbox(Menu):
 
         layout.operator("text.run_script")
 
+
+classes = (
+    TEXT_HT_header,
+    TEXT_MT_edit,
+    TEXT_MT_editor_menus,
+    TEXT_PT_properties,
+    TEXT_PT_find,
+    TEXT_MT_view,
+    TEXT_MT_text,
+    TEXT_MT_templates,
+    TEXT_MT_templates_py,
+    TEXT_MT_templates_osl,
+    TEXT_MT_edit_select,
+    TEXT_MT_format,
+    TEXT_MT_edit_to3d,
+    TEXT_MT_toolbox,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_time.py b/release/scripts/startup/bl_ui/space_time.py
index 508e62e4f56..b9a25cd72a0 100644
--- a/release/scripts/startup/bl_ui/space_time.py
+++ b/release/scripts/startup/bl_ui/space_time.py
@@ -49,7 +49,10 @@ class TIME_HT_header(Header):
             row.prop(scene, "frame_preview_start", text="Start")
             row.prop(scene, "frame_preview_end", text="End")
 
-        layout.prop(scene, "frame_current", text="")
+        if scene.show_subframe:
+            layout.prop(scene, "frame_float", text="")
+        else:
+            layout.prop(scene, "frame_current", text="")
 
         layout.separator()
 
@@ -135,6 +138,7 @@ class TIME_MT_view(Menu):
 
         layout.prop(st, "show_frame_indicator")
         layout.prop(scene, "show_keys_from_selected_only")
+        layout.prop(scene, "show_subframe")
 
         layout.separator()
 
@@ -265,5 +269,18 @@ def marker_menu_generic(layout):
     layout.prop(ts, "lock_markers")
 
 
+classes = (
+    TIME_HT_header,
+    TIME_MT_editor_menus,
+    TIME_MT_marker,
+    TIME_MT_view,
+    TIME_MT_cache,
+    TIME_MT_frame,
+    TIME_MT_playback,
+    TIME_MT_autokey,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_userpref.py b/release/scripts/startup/bl_ui/space_userpref.py
index 075a6f870fa..e50beba50d8 100644
--- a/release/scripts/startup/bl_ui/space_userpref.py
+++ b/release/scripts/startup/bl_ui/space_userpref.py
@@ -90,6 +90,63 @@ class USERPREF_MT_interaction_presets(Menu):
     draw = Menu.draw_preset
 
 
+class USERPREF_MT_app_templates(Menu):
+    bl_label = "Application Templates"
+    preset_subdir = "app_templates"
+
+    def draw_ex(self, context, *, use_splash=False, use_default=False, use_install=False):
+        import os
+
+        layout = self.layout
+
+        # now draw the presets
+        layout.operator_context = 'EXEC_DEFAULT'
+
+        # NOTE(review): use_splash is accepted but never read; every entry sets it True.
+        if use_default:
+            props = layout.operator("wm.read_homefile", text="Default")
+            props.use_splash = True
+            props.app_template = ""
+            layout.separator()
+
+        template_paths = bpy.utils.app_template_paths()
+
+        # collect the names of template directories, skipping hidden and dunder entries
+        app_templates = []
+        for path in template_paths:
+            for d in os.listdir(path):
+                if d.startswith(("__", ".")):
+                    continue
+                template = os.path.join(path, d)
+                if os.path.isdir(template):
+                    app_templates.append(d)
+
+        # one menu entry per template, sorted by directory name
+        for d in sorted(app_templates):
+            props = layout.operator(
+                "wm.read_homefile",
+                text=bpy.path.display_name(d),
+            )
+            props.use_splash = True
+            props.app_template = d
+
+        if use_install:
+            layout.separator()
+            layout.operator_context = 'INVOKE_DEFAULT'
+            layout.operator("wm.app_template_install")
+
+    def draw(self, context):
+        self.draw_ex(context, use_splash=False, use_default=True, use_install=True)
+
+
+class USERPREF_MT_templates_splash(Menu):
+    bl_label = "Startup Templates"
+    preset_subdir = "templates"
+
+    def draw(self, context):
+        USERPREF_MT_app_templates.draw_ex(self, context, use_splash=True, use_default=True)
+
+
 class USERPREF_MT_appconfigs(Menu):
     bl_label = "AppPresets"
     preset_subdir = "keyconfig"
@@ -110,7 +167,17 @@ class USERPREF_MT_splash(Menu):
 
         split = layout.split()
         row = split.row()
-        row.label("")
+
+        if any(bpy.utils.app_template_paths()):
+            row.label("Template:")
+            template = context.user_preferences.app_template
+            row.menu(
+                "USERPREF_MT_templates_splash",
+                text=bpy.path.display_name(template) if template else "Default",
+            )
+        else:
+            row.label("")
+
         row = split.row()
         row.label("Interaction:")
 
@@ -150,6 +217,7 @@ class USERPREF_PT_interface(Panel):
 
         col = row.column()
         col.label(text="Display:")
+        col.prop(view, "ui_scale", text="Scale")
         col.prop(view, "show_tooltips")
         col.prop(view, "show_tooltips_python")
         col.prop(view, "show_object_info", text="Object Info")
@@ -400,11 +468,6 @@ class USERPREF_PT_system(Panel):
 
         col = colsplit.column()
         col.label(text="General:")
-        col.prop(system, "dpi")
-        col.label("Virtual Pixel Mode:")
-        col.prop(system, "virtual_pixel_mode", text="")
-
-        col.separator()
 
         col.prop(system, "frame_server_port")
         col.prop(system, "scrollback", text="Console Scrollback")
@@ -453,6 +516,7 @@ class USERPREF_PT_system(Panel):
         col.separator()
         col.label(text="Selection")
         col.prop(system, "select_method", text="")
+        col.prop(system, "use_select_pick_depth")
 
         col.separator()
 
@@ -1243,7 +1307,7 @@ class USERPREF_MT_addons_online_resources(Menu):
                 "wm.url_open", text="API Concepts", icon='URL',
                 ).url = bpy.types.WM_OT_doc_view._prefix + "/info_quickstart.html"
         layout.operator("wm.url_open", text="Add-on Tutorial", icon='URL',
-                ).url = "http://www.blender.org/api/blender_python_api_current/info_tutorial_addon.html"
+                ).url = bpy.types.WM_OT_doc_view._prefix + "/info_tutorial_addon.html"
 
 
 class USERPREF_PT_addons(Panel):
@@ -1317,11 +1381,18 @@ class USERPREF_PT_addons(Panel):
 
         # set in addon_utils.modules_refresh()
         if addon_utils.error_duplicates:
-            self.draw_error(col,
-                            "Multiple addons using the same name found!\n"
-                            "likely a problem with the script search path.\n"
-                            "(see console for details)",
-                            )
+            box = col.box()
+            row = box.row()
+            row.label("Multiple add-ons with the same name found!")
+            row.label(icon='ERROR')
+            box.label("Please delete one of each pair:")
+            for (addon_name, addon_file, addon_path) in addon_utils.error_duplicates:
+                box.separator()
+                sub_col = box.column(align=True)
+                sub_col.label(addon_name + ":")
+                sub_col.label("    " + addon_file)
+                sub_col.label("    " + addon_path)
+
 
         if addon_utils.error_encoding:
             self.draw_error(col,
@@ -1473,5 +1544,29 @@ class USERPREF_PT_addons(Panel):
                 row.label(text=module_name, translate=False)
 
 
+classes = (
+    USERPREF_HT_header,
+    USERPREF_PT_tabs,
+    USERPREF_MT_interaction_presets,
+    USERPREF_MT_templates_splash,
+    USERPREF_MT_app_templates,
+    USERPREF_MT_appconfigs,
+    USERPREF_MT_splash,
+    USERPREF_MT_splash_footer,
+    USERPREF_PT_interface,
+    USERPREF_PT_edit,
+    USERPREF_PT_system,
+    USERPREF_MT_interface_theme_presets,
+    USERPREF_PT_theme,
+    USERPREF_PT_file,
+    USERPREF_MT_ndof_settings,
+    USERPREF_MT_keyconfigs,
+    USERPREF_PT_input,
+    USERPREF_MT_addons_online_resources,
+    USERPREF_PT_addons,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_view3d.py b/release/scripts/startup/bl_ui/space_view3d.py
index 1183d078a8c..f13c7095f67 100644
--- a/release/scripts/startup/bl_ui/space_view3d.py
+++ b/release/scripts/startup/bl_ui/space_view3d.py
@@ -284,7 +284,8 @@ class VIEW3D_MT_transform_object(VIEW3D_MT_transform_base):
         layout.separator()
 
         layout.operator_context = 'EXEC_REGION_WIN'
-        layout.operator("transform.transform", text="Align to Transform Orientation").mode = 'ALIGN'  # XXX see alignmenu() in edit.c of b2.4x to get this working
+        # XXX see alignmenu() in edit.c of b2.4x to get this working
+        layout.operator("transform.transform", text="Align to Transform Orientation").mode = 'ALIGN'
 
         layout.separator()
 
@@ -1205,6 +1206,16 @@ class INFO_MT_lamp_add(Menu):
         layout.operator_enum("object.lamp_add", "type")
 
 
+class INFO_MT_camera_add(Menu):
+    bl_idname = "INFO_MT_camera_add"
+    bl_label = "Camera"
+
+    def draw(self, context):
+        layout = self.layout
+        layout.operator_context = 'EXEC_REGION_WIN'
+        layout.operator("object.camera_add", text="Camera", icon='OUTLINER_OB_CAMERA')
+
+
 class INFO_MT_add(Menu):
     bl_label = "Add"
 
@@ -1236,7 +1247,11 @@ class INFO_MT_add(Menu):
         layout.operator("object.speaker_add", text="Speaker", icon='OUTLINER_OB_SPEAKER')
         layout.separator()
 
-        layout.operator("object.camera_add", text="Camera", icon='OUTLINER_OB_CAMERA')
+        if INFO_MT_camera_add.is_extended():
+            layout.menu("INFO_MT_camera_add", icon='OUTLINER_OB_CAMERA')
+        else:
+            INFO_MT_camera_add.draw(self, context)
+
         layout.menu("INFO_MT_lamp_add", icon='OUTLINER_OB_LAMP')
         layout.separator()
 
@@ -1533,6 +1548,7 @@ class VIEW3D_MT_object_apply(Menu):
         layout.separator()
 
         layout.operator("object.visual_transform_apply", text="Visual Transform", text_ctxt=i18n_contexts.default)
+        layout.operator("object.convert", text="Visual Geometry to Mesh", text_ctxt=i18n_contexts.default).target = 'MESH'
         layout.operator("object.duplicates_make_real")
 
 
@@ -1744,6 +1760,7 @@ class VIEW3D_MT_brush_paint_modes(Menu):
         layout.prop(brush, "use_paint_weight", text="Weight Paint")
         layout.prop(brush, "use_paint_image", text="Texture Paint")
 
+
 # ********** Vertex paint menu **********
 
 
@@ -1813,6 +1830,7 @@ class VIEW3D_MT_vertex_group(Menu):
             layout.operator("object.vertex_group_remove", text="Remove Active Group").all = False
             layout.operator("object.vertex_group_remove", text="Remove All Groups").all = True
 
+
 # ********** Weight paint menu **********
 
 
@@ -1851,6 +1869,7 @@ class VIEW3D_MT_paint_weight(Menu):
 
         layout.operator("paint.weight_set")
 
+
 # ********** Sculpt menu **********
 
 
@@ -2004,6 +2023,7 @@ class VIEW3D_MT_particle_specials(Menu):
 class VIEW3D_MT_particle_showhide(ShowHideMenu, Menu):
     _operator_name = "particle"
 
+
 # ********** Pose Menu **********
 
 
@@ -2277,6 +2297,7 @@ class VIEW3D_MT_bone_options_disable(Menu, BoneOptions):
     bl_label = "Disable Bone Options"
     type = 'DISABLE'
 
+
 # ********** Edit Menus, suffix from ob.type **********
 
 
@@ -2444,6 +2465,7 @@ class VIEW3D_MT_edit_mesh_vertices(Menu):
         with_bullet = bpy.app.build_options.bullet
 
         layout.operator("mesh.merge")
+        layout.operator("mesh.remove_doubles")
         layout.operator("mesh.rip_move")
         layout.operator("mesh.rip_move_fill")
         layout.operator("mesh.rip_edge_move")
@@ -2466,7 +2488,6 @@ class VIEW3D_MT_edit_mesh_vertices(Menu):
         if with_bullet:
             layout.operator("mesh.convex_hull")
         layout.operator("mesh.vertices_smooth")
-        layout.operator("mesh.remove_doubles")
 
         layout.operator("mesh.blend_from_shape")
 
@@ -2623,6 +2644,7 @@ class VIEW3D_MT_edit_mesh_clean(Menu):
         layout.operator("mesh.face_make_planar")
         layout.operator("mesh.vert_connect_nonplanar")
         layout.operator("mesh.vert_connect_concave")
+        layout.operator("mesh.remove_doubles")
         layout.operator("mesh.fill_holes")
 
 
@@ -3780,13 +3802,143 @@ class VIEW3D_PT_context_properties(Panel):
             # Draw with no edit button
             rna_prop_ui.draw(self.layout, context, member, object, False)
 
-
-def register():
-    bpy.utils.register_module(__name__)
-
-
-def unregister():
-    bpy.utils.unregister_module(__name__)
-
-if __name__ == "__main__":
-    register()
+classes = (
+    VIEW3D_HT_header,
+    VIEW3D_MT_editor_menus,
+    VIEW3D_MT_transform,
+    VIEW3D_MT_transform_base,
+    VIEW3D_MT_transform_object,
+    VIEW3D_MT_transform_armature,
+    VIEW3D_MT_mirror,
+    VIEW3D_MT_snap,
+    VIEW3D_MT_uv_map,
+    VIEW3D_MT_edit_proportional,
+    VIEW3D_MT_view,
+    VIEW3D_MT_view_navigation,
+    VIEW3D_MT_view_align,
+    VIEW3D_MT_view_align_selected,
+    VIEW3D_MT_view_cameras,
+    VIEW3D_MT_select_object,
+    VIEW3D_MT_select_object_more_less,
+    VIEW3D_MT_select_pose,
+    VIEW3D_MT_select_pose_more_less,
+    VIEW3D_MT_select_particle,
+    VIEW3D_MT_edit_mesh,
+    VIEW3D_MT_edit_mesh_select_similar,
+    VIEW3D_MT_edit_mesh_select_by_trait,
+    VIEW3D_MT_edit_mesh_select_more_less,
+    VIEW3D_MT_select_edit_mesh,
+    VIEW3D_MT_select_edit_curve,
+    VIEW3D_MT_select_edit_surface,
+    VIEW3D_MT_select_edit_text,
+    VIEW3D_MT_select_edit_metaball,
+    VIEW3D_MT_select_edit_lattice,
+    VIEW3D_MT_select_edit_armature,
+    VIEW3D_MT_select_gpencil,
+    VIEW3D_MT_select_paint_mask,
+    VIEW3D_MT_select_paint_mask_vertex,
+    VIEW3D_MT_angle_control,
+    INFO_MT_mesh_add,
+    INFO_MT_curve_add,
+    INFO_MT_surface_add,
+    INFO_MT_metaball_add,
+    INFO_MT_edit_curve_add,
+    INFO_MT_edit_armature_add,
+    INFO_MT_armature_add,
+    INFO_MT_lamp_add,
+    INFO_MT_camera_add,
+    INFO_MT_add,
+    VIEW3D_MT_object,
+    VIEW3D_MT_object_animation,
+    VIEW3D_MT_object_clear,
+    VIEW3D_MT_object_specials,
+    VIEW3D_MT_object_apply,
+    VIEW3D_MT_object_parent,
+    VIEW3D_MT_object_track,
+    VIEW3D_MT_object_group,
+    VIEW3D_MT_object_constraints,
+    VIEW3D_MT_object_quick_effects,
+    VIEW3D_MT_object_showhide,
+    VIEW3D_MT_make_single_user,
+    VIEW3D_MT_make_links,
+    VIEW3D_MT_object_game,
+    VIEW3D_MT_brush,
+    VIEW3D_MT_brush_paint_modes,
+    VIEW3D_MT_paint_vertex,
+    VIEW3D_MT_hook,
+    VIEW3D_MT_vertex_group,
+    VIEW3D_MT_paint_weight,
+    VIEW3D_MT_sculpt,
+    VIEW3D_MT_hide_mask,
+    VIEW3D_MT_particle,
+    VIEW3D_MT_particle_specials,
+    VIEW3D_MT_particle_showhide,
+    VIEW3D_MT_pose,
+    VIEW3D_MT_pose_transform,
+    VIEW3D_MT_pose_slide,
+    VIEW3D_MT_pose_propagate,
+    VIEW3D_MT_pose_library,
+    VIEW3D_MT_pose_motion,
+    VIEW3D_MT_pose_group,
+    VIEW3D_MT_pose_ik,
+    VIEW3D_MT_pose_constraints,
+    VIEW3D_MT_pose_showhide,
+    VIEW3D_MT_pose_apply,
+    VIEW3D_MT_pose_specials,
+    VIEW3D_MT_bone_options_toggle,
+    VIEW3D_MT_bone_options_enable,
+    VIEW3D_MT_bone_options_disable,
+    VIEW3D_MT_edit_mesh_specials,
+    VIEW3D_MT_edit_mesh_select_mode,
+    VIEW3D_MT_edit_mesh_extrude,
+    VIEW3D_MT_edit_mesh_vertices,
+    VIEW3D_MT_edit_mesh_edges,
+    VIEW3D_MT_edit_mesh_faces,
+    VIEW3D_MT_edit_mesh_normals,
+    VIEW3D_MT_edit_mesh_clean,
+    VIEW3D_MT_edit_mesh_delete,
+    VIEW3D_MT_edit_mesh_showhide,
+    VIEW3D_MT_edit_gpencil,
+    VIEW3D_MT_edit_gpencil_delete,
+    VIEW3D_MT_edit_curve,
+    VIEW3D_MT_edit_curve_ctrlpoints,
+    VIEW3D_MT_edit_curve_segments,
+    VIEW3D_MT_edit_curve_specials,
+    VIEW3D_MT_edit_curve_delete,
+    VIEW3D_MT_edit_curve_showhide,
+    VIEW3D_MT_edit_surface,
+    VIEW3D_MT_edit_font,
+    VIEW3D_MT_edit_text_chars,
+    VIEW3D_MT_edit_meta,
+    VIEW3D_MT_edit_meta_showhide,
+    VIEW3D_MT_edit_lattice,
+    VIEW3D_MT_edit_armature,
+    VIEW3D_MT_armature_specials,
+    VIEW3D_MT_edit_armature_parent,
+    VIEW3D_MT_edit_armature_roll,
+    VIEW3D_MT_edit_armature_delete,
+    VIEW3D_MT_edit_gpencil_transform,
+    VIEW3D_MT_edit_gpencil_interpolate,
+    VIEW3D_PT_grease_pencil,
+    VIEW3D_PT_grease_pencil_palettecolor,
+    VIEW3D_PT_view3d_properties,
+    VIEW3D_PT_view3d_cursor,
+    VIEW3D_PT_view3d_name,
+    VIEW3D_PT_view3d_display,
+    VIEW3D_PT_view3d_stereo,
+    VIEW3D_PT_view3d_shading,
+    VIEW3D_PT_view3d_motion_tracking,
+    VIEW3D_PT_view3d_meshdisplay,
+    VIEW3D_PT_view3d_meshstatvis,
+    VIEW3D_PT_view3d_curvedisplay,
+    VIEW3D_PT_background_image,
+    VIEW3D_PT_transform_orientations,
+    VIEW3D_PT_etch_a_ton,
+    VIEW3D_PT_context_properties,
+)
+
+
+if __name__ == "__main__":  # only for live edit.
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/bl_ui/space_view3d_toolbar.py b/release/scripts/startup/bl_ui/space_view3d_toolbar.py
index 60e86d7544d..d58453deaef 100644
--- a/release/scripts/startup/bl_ui/space_view3d_toolbar.py
+++ b/release/scripts/startup/bl_ui/space_view3d_toolbar.py
@@ -2008,5 +2008,69 @@ class VIEW3D_PT_tools_history(View3DPanel, Panel):
         col.operator("screen.repeat_history", text="History...")
 
 
+classes = (
+    VIEW3D_PT_tools_transform,
+    VIEW3D_PT_tools_object,
+    VIEW3D_PT_tools_add_object,
+    VIEW3D_PT_tools_relations,
+    VIEW3D_PT_tools_animation,
+    VIEW3D_PT_tools_rigid_body,
+    VIEW3D_PT_tools_transform_mesh,
+    VIEW3D_PT_tools_meshedit,
+    VIEW3D_PT_tools_meshweight,
+    VIEW3D_PT_tools_add_mesh_edit,
+    VIEW3D_PT_tools_shading,
+    VIEW3D_PT_tools_uvs,
+    VIEW3D_PT_tools_meshedit_options,
+    VIEW3D_PT_tools_transform_curve,
+    VIEW3D_PT_tools_curveedit,
+    VIEW3D_PT_tools_add_curve_edit,
+    VIEW3D_PT_tools_curveedit_options_stroke,
+    VIEW3D_PT_tools_transform_surface,
+    VIEW3D_PT_tools_surfaceedit,
+    VIEW3D_PT_tools_add_surface_edit,
+    VIEW3D_PT_tools_textedit,
+    VIEW3D_PT_tools_armatureedit,
+    VIEW3D_PT_tools_armatureedit_transform,
+    VIEW3D_PT_tools_armatureedit_options,
+    VIEW3D_PT_tools_mballedit,
+    VIEW3D_PT_tools_add_mball_edit,
+    VIEW3D_PT_tools_latticeedit,
+    VIEW3D_PT_tools_posemode,
+    VIEW3D_PT_tools_posemode_options,
+    VIEW3D_PT_imapaint_tools_missing,
+    VIEW3D_PT_tools_brush,
+    TEXTURE_UL_texpaintslots,
+    VIEW3D_MT_tools_projectpaint_uvlayer,
+    VIEW3D_PT_slots_projectpaint,
+    VIEW3D_PT_stencil_projectpaint,
+    VIEW3D_PT_tools_brush_overlay,
+    VIEW3D_PT_tools_brush_texture,
+    VIEW3D_PT_tools_mask_texture,
+    VIEW3D_PT_tools_brush_stroke,
+    VIEW3D_PT_tools_brush_curve,
+    VIEW3D_PT_sculpt_dyntopo,
+    VIEW3D_PT_sculpt_options,
+    VIEW3D_PT_sculpt_symmetry,
+    VIEW3D_PT_tools_brush_appearance,
+    VIEW3D_PT_tools_weightpaint,
+    VIEW3D_PT_tools_weightpaint_options,
+    VIEW3D_PT_tools_vertexpaint,
+    VIEW3D_PT_tools_imagepaint_external,
+    VIEW3D_PT_tools_imagepaint_symmetry,
+    VIEW3D_PT_tools_projectpaint,
+    VIEW3D_MT_tools_projectpaint_stencil,
+    VIEW3D_PT_tools_particlemode,
+    VIEW3D_PT_tools_grease_pencil_draw,
+    VIEW3D_PT_tools_grease_pencil_edit,
+    VIEW3D_PT_tools_grease_pencil_interpolate,
+    VIEW3D_PT_tools_grease_pencil_sculpt,
+    VIEW3D_PT_tools_grease_pencil_brush,
+    VIEW3D_PT_tools_grease_pencil_brushcurves,
+    VIEW3D_PT_tools_history,
+)
+
 if __name__ == "__main__":  # only for live edit.
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
diff --git a/release/scripts/startup/keyingsets_builtins.py b/release/scripts/startup/keyingsets_builtins.py
index 97320e56378..ce0b1b62d44 100644
--- a/release/scripts/startup/keyingsets_builtins.py
+++ b/release/scripts/startup/keyingsets_builtins.py
@@ -643,13 +643,41 @@ class BUILTIN_KSI_DeltaScale(KeyingSetInfo):
 
 ###############################
 
+classes = (
+    BUILTIN_KSI_Available,
+    BUILTIN_KSI_BendyBones,
+    BUILTIN_KSI_DeltaLocation,
+    BUILTIN_KSI_DeltaRotation,
+    BUILTIN_KSI_DeltaScale,
+    BUILTIN_KSI_LocRot,
+    BUILTIN_KSI_LocRotScale,
+    BUILTIN_KSI_LocScale,
+    BUILTIN_KSI_Location,
+    BUILTIN_KSI_RotScale,
+    BUILTIN_KSI_Rotation,
+    BUILTIN_KSI_Scaling,
+    BUILTIN_KSI_VisualLoc,
+    BUILTIN_KSI_VisualLocRot,
+    BUILTIN_KSI_VisualLocRotScale,
+    BUILTIN_KSI_VisualLocScale,
+    BUILTIN_KSI_VisualRot,
+    BUILTIN_KSI_VisualRotScale,
+    BUILTIN_KSI_VisualScaling,
+    BUILTIN_KSI_WholeCharacter,
+    BUILTIN_KSI_WholeCharacterSelected,
+)
+
 
 def register():
-    bpy.utils.register_module(__name__)
+    from bpy.utils import register_class
+    for cls in classes:
+        register_class(cls)
 
 
 def unregister():
-    bpy.utils.unregister_module(__name__)
+    from bpy.utils import unregister_class
+    for cls in classes:
+        unregister_class(cls)
 
 
 if __name__ == "__main__":
diff --git a/release/scripts/startup/nodeitems_builtins.py b/release/scripts/startup/nodeitems_builtins.py
index 7c0adb3077a..8fd039f2bf7 100644
--- a/release/scripts/startup/nodeitems_builtins.py
+++ b/release/scripts/startup/nodeitems_builtins.py
@@ -19,7 +19,11 @@
 # <pep8 compliant>
 import bpy
 import nodeitems_utils
-from nodeitems_utils import NodeCategory, NodeItem, NodeItemCustom
+from nodeitems_utils import (
+    NodeCategory,
+    NodeItem,
+    NodeItemCustom,
+)
 
 
 # Subclasses for standard node types
diff --git a/source/blender/alembic/intern/abc_archive.cc b/source/blender/alembic/intern/abc_archive.cc
index 0985a06d732..5f8fc1a3739 100644
--- a/source/blender/alembic/intern/abc_archive.cc
+++ b/source/blender/alembic/intern/abc_archive.cc
@@ -113,25 +113,25 @@ static OArchive create_archive(std::ostream *ostream,
                                Alembic::Abc::MetaData &md,
                                bool ogawa)
 {
-    md.set(Alembic::Abc::kApplicationNameKey, "Blender");
+	md.set(Alembic::Abc::kApplicationNameKey, "Blender");
 	md.set(Alembic::Abc::kUserDescriptionKey, scene_name);
 
-    time_t raw_time;
-    time(&raw_time);
-    char buffer[128];
+	time_t raw_time;
+	time(&raw_time);
+	char buffer[128];
 
 #if defined _WIN32 || defined _WIN64
-    ctime_s(buffer, 128, &raw_time);
+	ctime_s(buffer, 128, &raw_time);
 #else
-    ctime_r(&raw_time, buffer);
+	ctime_r(&raw_time, buffer);
 #endif
 
-    const std::size_t buffer_len = strlen(buffer);
-    if (buffer_len > 0 && buffer[buffer_len - 1] == '\n') {
-        buffer[buffer_len - 1] = '\0';
-    }
+	const std::size_t buffer_len = strlen(buffer);
+	if (buffer_len > 0 && buffer[buffer_len - 1] == '\n') {
+		buffer[buffer_len - 1] = '\0';
+	}
 
-    md.set(Alembic::Abc::kDateWrittenKey, buffer);
+	md.set(Alembic::Abc::kDateWrittenKey, buffer);
 
 	ErrorHandler::Policy policy = ErrorHandler::kThrowPolicy;
 
diff --git a/source/blender/alembic/intern/abc_curves.cc b/source/blender/alembic/intern/abc_curves.cc
index 4ecb9d944f2..0542255d84b 100644
--- a/source/blender/alembic/intern/abc_curves.cc
+++ b/source/blender/alembic/intern/abc_curves.cc
@@ -102,7 +102,7 @@ void AbcCurveWriter::do_write()
 			const BPoint *point = nurbs->bp;
 
 			for (int i = 0; i < totpoint; ++i, ++point) {
-				copy_zup_yup(temp_vert.getValue(), point->vec);
+				copy_yup_from_zup(temp_vert.getValue(), point->vec);
 				verts.push_back(temp_vert);
 				weights.push_back(point->vec[3]);
 				widths.push_back(point->radius);
@@ -118,7 +118,7 @@ void AbcCurveWriter::do_write()
 
 			/* TODO(kevin): store info about handles, Alembic doesn't have this. */
 			for (int i = 0; i < totpoint; ++i, ++bezier) {
-				copy_zup_yup(temp_vert.getValue(), bezier->vec[1]);
+				copy_yup_from_zup(temp_vert.getValue(), bezier->vec[1]);
 				verts.push_back(temp_vert);
 				widths.push_back(bezier->radius);
 			}
@@ -322,7 +322,7 @@ void read_curve_sample(Curve *cu, const ICurvesSchema &schema, const float time)
 				weight = (*weights)[idx];
 			}
 
-			copy_yup_zup(bp->vec, pos.getValue());
+			copy_zup_from_yup(bp->vec, pos.getValue());
 			bp->vec[3] = weight;
 			bp->f1 = SELECT;
 			bp->radius = radius;
@@ -361,7 +361,7 @@ void read_curve_sample(Curve *cu, const ICurvesSchema &schema, const float time)
  * object directly and create a new DerivedMesh from that. Also we might need to
  * create new or delete existing NURBS in the curve.
  */
-DerivedMesh *AbcCurveReader::read_derivedmesh(DerivedMesh */*dm*/, const float time, int /*read_flag*/, const char **/*err_str*/)
+DerivedMesh *AbcCurveReader::read_derivedmesh(DerivedMesh * /*dm*/, const float time, int /*read_flag*/, const char ** /*err_str*/)
 {
 	ISampleSelector sample_sel(time);
 	const ICurvesSchema::Sample sample = m_curves_schema.getValue(sample_sel);
@@ -389,7 +389,7 @@ DerivedMesh *AbcCurveReader::read_derivedmesh(DerivedMesh */*dm*/, const float t
 
 				for (int i = 0; i < totpoint; ++i, ++point, ++vertex_idx) {
 					const Imath::V3f &pos = (*positions)[vertex_idx];
-					copy_yup_zup(point->vec, pos.getValue());
+					copy_zup_from_yup(point->vec, pos.getValue());
 				}
 			}
 			else if (nurbs->bezt) {
@@ -397,7 +397,7 @@ DerivedMesh *AbcCurveReader::read_derivedmesh(DerivedMesh */*dm*/, const float t
 
 				for (int i = 0; i < totpoint; ++i, ++bezier, ++vertex_idx) {
 					const Imath::V3f &pos = (*positions)[vertex_idx];
-					copy_yup_zup(bezier->vec[1], pos.getValue());
+					copy_zup_from_yup(bezier->vec[1], pos.getValue());
 				}
 			}
 		}
diff --git a/source/blender/alembic/intern/abc_customdata.cc b/source/blender/alembic/intern/abc_customdata.cc
index ebf1b2ba96e..0d11ab79ddd 100644
--- a/source/blender/alembic/intern/abc_customdata.cc
+++ b/source/blender/alembic/intern/abc_customdata.cc
@@ -327,6 +327,11 @@ static void read_custom_data_ex(const ICompoundProperty &prop,
 	}
 	else if (data_type == CD_MLOOPUV) {
 		IV2fGeomParam uv_param(prop, prop_header.getName());
+
+		if (!uv_param.isIndexed()) {
+			return;
+		}
+
 		IV2fGeomParam::Sample sample;
 		uv_param.getIndexed(sample, iss);
 
diff --git a/source/blender/alembic/intern/abc_exporter.cc b/source/blender/alembic/intern/abc_exporter.cc
index ff8b0442ab6..564157fbc97 100644
--- a/source/blender/alembic/intern/abc_exporter.cc
+++ b/source/blender/alembic/intern/abc_exporter.cc
@@ -47,7 +47,7 @@ extern "C" {
 
 #ifdef WIN32
 /* needed for MSVC because of snprintf from BLI_string */
-#	include "BLI_winstuff.h"
+#  include "BLI_winstuff.h"
 #endif
 
 #include "BKE_anim.h"
@@ -347,7 +347,7 @@ void AbcExporter::createTransformWritersHierarchy(EvaluationContext *eval_ctx)
 		Object *ob = base->object;
 
 		if (export_object(&m_settings, ob)) {
-			switch(ob->type) {
+			switch (ob->type) {
 				case OB_LAMP:
 				case OB_LATTICE:
 				case OB_MBALL:
@@ -382,7 +382,10 @@ void AbcExporter::createTransformWritersFlat()
 
 void AbcExporter::exploreTransform(EvaluationContext *eval_ctx, Object *ob, Object *parent, Object *dupliObParent)
 {
-	createTransformWriter(ob, parent, dupliObParent);
+
+	if (export_object(&m_settings, ob) && object_is_shape(ob)) {
+		createTransformWriter(ob, parent, dupliObParent);
+	}
 
 	ListBase *lb = object_duplilist(eval_ctx, m_scene, ob);
 
@@ -410,8 +413,12 @@ void AbcExporter::createTransformWriter(Object *ob, Object *parent, Object *dupl
 {
 	const std::string name = get_object_dag_path_name(ob, dupliObParent);
 
+	/* An object should not be its own parent, or we'll get infinite loops. */
+	BLI_assert(ob != parent);
+	BLI_assert(ob != dupliObParent);
+
 	/* check if we have already created a transform writer for this object */
-	if (m_xforms.find(name) != m_xforms.end()){
+	if (getXForm(name) != NULL) {
 		std::cerr << "xform " << name << " already exists\n";
 		return;
 	}
@@ -426,6 +433,14 @@ void AbcExporter::createTransformWriter(Object *ob, Object *parent, Object *dupl
 			if (parent->parent) {
 				createTransformWriter(parent, parent->parent, dupliObParent);
 			}
+			else if (parent == dupliObParent) {
+				if (dupliObParent->parent == NULL) {
+					createTransformWriter(parent, NULL, NULL);
+				}
+				else {
+					createTransformWriter(parent, dupliObParent->parent, dupliObParent->parent);
+				}
+			}
 			else {
 				createTransformWriter(parent, dupliObParent, dupliObParent);
 			}
@@ -518,7 +533,7 @@ void AbcExporter::createShapeWriter(Object *ob, Object *dupliObParent)
 		}
 	}
 
-	switch(ob->type) {
+	switch (ob->type) {
 		case OB_MESH:
 		{
 			Mesh *me = static_cast<Mesh *>(ob->data);
diff --git a/source/blender/alembic/intern/abc_hair.cc b/source/blender/alembic/intern/abc_hair.cc
index 14bcf6731ea..e328df24e53 100644
--- a/source/blender/alembic/intern/abc_hair.cc
+++ b/source/blender/alembic/intern/abc_hair.cc
@@ -77,7 +77,6 @@ void AbcHairWriter::do_write()
 
 	DerivedMesh *dm = mesh_create_derived_view(m_scene, m_object, CD_MASK_MESH);
 	DM_ensure_tessface(dm);
-	DM_update_tessface_data(dm);
 
 	std::vector<Imath::V3f> verts;
 	std::vector<int32_t> hvertices;
@@ -164,7 +163,7 @@ void AbcHairWriter::write_hair_sample(DerivedMesh *dm,
 
 					psys_interpolate_face(mverts, face, tface, NULL, mapfw, vec, normal, NULL, NULL, NULL, NULL);
 
-					copy_zup_yup(tmp_nor.getValue(), normal);
+					copy_yup_from_zup(tmp_nor.getValue(), normal);
 					norm_values.push_back(tmp_nor);
 				}
 			}
@@ -198,7 +197,7 @@ void AbcHairWriter::write_hair_sample(DerivedMesh *dm,
 						MVert *mv = mverts + vtx[o];
 
 						normal_short_to_float_v3(normal, mv->no);
-						copy_zup_yup(tmp_nor.getValue(), normal);
+						copy_yup_from_zup(tmp_nor.getValue(), normal);
 						norm_values.push_back(tmp_nor);
 						found = true;
 						break;
diff --git a/source/blender/alembic/intern/abc_mesh.cc b/source/blender/alembic/intern/abc_mesh.cc
index bdd75f93189..5a57e43326a 100644
--- a/source/blender/alembic/intern/abc_mesh.cc
+++ b/source/blender/alembic/intern/abc_mesh.cc
@@ -112,7 +112,7 @@ static void get_vertices(DerivedMesh *dm, std::vector<Imath::V3f> &points)
 	MVert *verts = dm->getVertArray(dm);
 
 	for (int i = 0, e = dm->getNumVerts(dm); i < e; ++i) {
-		copy_zup_yup(points[i].getValue(), verts[i].co);
+		copy_yup_from_zup(points[i].getValue(), verts[i].co);
 	}
 }
 
@@ -182,7 +182,7 @@ static void get_vertex_normals(DerivedMesh *dm, std::vector<Imath::V3f> &normals
 
 	for (int i = 0, e = dm->getNumVerts(dm); i < e; ++i) {
 		normal_short_to_float_v3(no, verts[i].no);
-		copy_zup_yup(normals[i].getValue(), no);
+		copy_yup_from_zup(normals[i].getValue(), no);
 	}
 }
 
@@ -211,7 +211,7 @@ static void get_loop_normals(DerivedMesh *dm, std::vector<Imath::V3f> &normals)
 
 			for (int j = 0; j < mp->totloop; --ml, ++j, ++loop_index) {
 				const int index = ml->v;
-				copy_zup_yup(normals[loop_index].getValue(), lnors[index]);
+				copy_yup_from_zup(normals[loop_index].getValue(), lnors[index]);
 			}
 		}
 	}
@@ -226,14 +226,14 @@ static void get_loop_normals(DerivedMesh *dm, std::vector<Imath::V3f> &normals)
 				BKE_mesh_calc_poly_normal(mp, ml - (mp->totloop - 1), verts, no);
 
 				for (int j = 0; j < mp->totloop; --ml, ++j, ++loop_index) {
-					copy_zup_yup(normals[loop_index].getValue(), no);
+					copy_yup_from_zup(normals[loop_index].getValue(), no);
 				}
 			}
 			else {
 				/* Smooth shaded, use individual vert normals. */
 				for (int j = 0; j < mp->totloop; --ml, ++j, ++loop_index) {
 					normal_short_to_float_v3(no, verts[ml->v].no);
-					copy_zup_yup(normals[loop_index].getValue(), no);
+					copy_yup_from_zup(normals[loop_index].getValue(), no);
 				}
 			}
 		}
@@ -590,7 +590,7 @@ void AbcMeshWriter::getVelocities(DerivedMesh *dm, std::vector<Imath::V3f> &vels
 		float *mesh_vels = reinterpret_cast<float *>(fss->meshVelocities);
 
 		for (int i = 0; i < totverts; ++i) {
-			copy_zup_yup(vels[i].getValue(), mesh_vels);
+			copy_yup_from_zup(vels[i].getValue(), mesh_vels);
 			mesh_vels += 3;
 		}
 	}
@@ -691,7 +691,7 @@ static void assign_materials(Main *bmain, Object *ob, const std::map<std::string
 				assigned_name = mat_iter->second;
 			}
 
-			assign_material(ob, assigned_name, it->second, BKE_MAT_ASSIGN_OBJECT);
+			assign_material(ob, assigned_name, it->second, BKE_MAT_ASSIGN_OBDATA);
 		}
 	}
 }
@@ -726,7 +726,7 @@ static void read_mverts_interp(MVert *mverts, const P3fArraySamplePtr &positions
 		const Imath::V3f &ceil_pos = (*ceil_positions)[i];
 
 		interp_v3_v3v3(tmp, floor_pos.getValue(), ceil_pos.getValue(), weight);
-		copy_yup_zup(mvert.co, tmp);
+		copy_zup_from_yup(mvert.co, tmp);
 
 		mvert.bweight = 0;
 	}
@@ -755,7 +755,7 @@ void read_mverts(MVert *mverts, const P3fArraySamplePtr &positions, const N3fArr
 		MVert &mvert = mverts[i];
 		Imath::V3f pos_in = (*positions)[i];
 
-		copy_yup_zup(mvert.co, pos_in.getValue());
+		copy_zup_from_yup(mvert.co, pos_in.getValue());
 
 		mvert.bweight = 0;
 
@@ -765,7 +765,7 @@ void read_mverts(MVert *mverts, const P3fArraySamplePtr &positions, const N3fArr
 			short no[3];
 			normal_float_to_short_v3(no, nor_in.getValue());
 
-			copy_yup_zup(mvert.no, no);
+			copy_zup_from_yup(mvert.no, no);
 		}
 	}
 }
diff --git a/source/blender/alembic/intern/abc_nurbs.cc b/source/blender/alembic/intern/abc_nurbs.cc
index 4f57dfdae9e..d0b9561f679 100644
--- a/source/blender/alembic/intern/abc_nurbs.cc
+++ b/source/blender/alembic/intern/abc_nurbs.cc
@@ -153,7 +153,7 @@ void AbcNurbsWriter::do_write()
 		const BPoint *bp = nu->bp;
 
 		for (int i = 0; i < size; ++i, ++bp) {
-			copy_zup_yup(positions[i].getValue(), bp->vec);
+			copy_yup_from_zup(positions[i].getValue(), bp->vec);
 			weights[i] = bp->vec[3];
 		}
 
@@ -281,7 +281,7 @@ void AbcNurbsReader::readObjectData(Main *bmain, float time)
 				posw_in = (*weights)[i];
 			}
 
-			copy_yup_zup(bp->vec, pos_in.getValue());
+			copy_zup_from_yup(bp->vec, pos_in.getValue());
 			bp->vec[3] = posw_in;
 			bp->f1 = SELECT;
 			bp->radius = 1.0f;
diff --git a/source/blender/alembic/intern/abc_object.cc b/source/blender/alembic/intern/abc_object.cc
index 9dfccdb8c7f..a5b8af542fc 100644
--- a/source/blender/alembic/intern/abc_object.cc
+++ b/source/blender/alembic/intern/abc_object.cc
@@ -97,14 +97,14 @@ Imath::Box3d AbcObjectWriter::bounds()
 		return Imath::Box3d();
 	}
 
-	/* Convert Z-up to Y-up. */
+	/* Convert Z-up to Y-up. This also changes which vector goes into which min/max property. */
 	this->m_bounds.min.x = bb->vec[0][0];
 	this->m_bounds.min.y = bb->vec[0][2];
-	this->m_bounds.min.z = -bb->vec[0][1];
+	this->m_bounds.min.z = -bb->vec[6][1];
 
 	this->m_bounds.max.x = bb->vec[6][0];
 	this->m_bounds.max.y = bb->vec[6][2];
-	this->m_bounds.max.z = -bb->vec[6][1];
+	this->m_bounds.max.z = -bb->vec[0][1];
 
 	return this->m_bounds;
 }
diff --git a/source/blender/alembic/intern/abc_points.cc b/source/blender/alembic/intern/abc_points.cc
index 4c78f3e83c7..fc84759b1d9 100644
--- a/source/blender/alembic/intern/abc_points.cc
+++ b/source/blender/alembic/intern/abc_points.cc
@@ -200,7 +200,7 @@ void read_points_sample(const IPointsSchema &schema,
 	read_mverts(config.mvert, positions, vnormals);
 }
 
-DerivedMesh *AbcPointsReader::read_derivedmesh(DerivedMesh *dm, const float time, int /*read_flag*/, const char **/*err_str*/)
+DerivedMesh *AbcPointsReader::read_derivedmesh(DerivedMesh *dm, const float time, int /*read_flag*/, const char ** /*err_str*/)
 {
 	ISampleSelector sample_sel(time);
 	const IPointsSchema::Sample sample = m_schema.getValue(sample_sel);
diff --git a/source/blender/alembic/intern/abc_transform.cc b/source/blender/alembic/intern/abc_transform.cc
index e2fc7674c4e..2c6ef09326c 100644
--- a/source/blender/alembic/intern/abc_transform.cc
+++ b/source/blender/alembic/intern/abc_transform.cc
@@ -122,7 +122,7 @@ Imath::Box3d AbcTransformWriter::bounds()
 	return Imath::transform(bounds, m_matrix);
 }
 
-bool AbcTransformWriter::hasAnimation(Object */*ob*/) const
+bool AbcTransformWriter::hasAnimation(Object * /*ob*/) const
 {
 	/* TODO(kevin): implement this. */
 	return true;
@@ -146,6 +146,6 @@ bool AbcEmptyReader::valid() const
 
 void AbcEmptyReader::readObjectData(Main *bmain, float /*time*/)
 {
-	m_object = BKE_object_add_only_object(bmain, OB_EMPTY, m_object_name.c_str());
+	m_object = BKE_object_add_only_object(bmain, OB_EMPTY, m_data_name.c_str());
 	m_object->data = NULL;
 }
diff --git a/source/blender/alembic/intern/abc_util.cc b/source/blender/alembic/intern/abc_util.cc
index f8ce72d845d..50fa43a3491 100644
--- a/source/blender/alembic/intern/abc_util.cc
+++ b/source/blender/alembic/intern/abc_util.cc
@@ -37,6 +37,8 @@ extern "C" {
 #include "DNA_object_types.h"
 
 #include "BLI_math.h"
+
+#include "PIL_time.h"
 }
 
 std::string get_id_name(Object *ob)
@@ -197,7 +199,7 @@ void create_transform_matrix(float r_mat[4][4])
 	copy_m4_m3(transform_mat, rot_mat);
 
 	/* Add translation to transformation matrix. */
-	copy_yup_zup(transform_mat[3], loc);
+	copy_zup_from_yup(transform_mat[3], loc);
 
 	/* Create scale matrix. */
 	scale_mat[0][0] = scale[0];
@@ -417,7 +419,7 @@ void create_transform_matrix(Object *obj, float transform_mat[4][4])
 	copy_m4_m3(transform_mat, rot_mat);
 
 	/* Add translation to transformation matrix. */
-	copy_zup_yup(transform_mat[3], loc);
+	copy_yup_from_zup(transform_mat[3], loc);
 
 	/* Create scale matrix. */
 	scale_mat[0][0] = scale[0];
@@ -523,3 +525,15 @@ AbcObjectReader *create_reader(const Alembic::AbcGeom::IObject &object, ImportSe
 
 	return reader;
 }
+
+/* ********************** */
+
+ScopeTimer::ScopeTimer(const char *message)
+	: m_message(message)
+	, m_start(PIL_check_seconds_timer())
+{}
+
+ScopeTimer::~ScopeTimer()
+{
+	fprintf(stderr, "%s: %fs\n", m_message, PIL_check_seconds_timer() - m_start);
+}
diff --git a/source/blender/alembic/intern/abc_util.h b/source/blender/alembic/intern/abc_util.h
index 60a96855d14..85ba4d5c9c7 100644
--- a/source/blender/alembic/intern/abc_util.h
+++ b/source/blender/alembic/intern/abc_util.h
@@ -116,14 +116,14 @@ AbcObjectReader *create_reader(const Alembic::AbcGeom::IObject &object, ImportSe
 
 /* Copy from Y-up to Z-up. */
 
-ABC_INLINE void copy_yup_zup(float zup[3], const float yup[3])
+ABC_INLINE void copy_zup_from_yup(float zup[3], const float yup[3])
 {
 	zup[0] = yup[0];
 	zup[1] = -yup[2];
 	zup[2] = yup[1];
 }
 
-ABC_INLINE void copy_yup_zup(short zup[3], const short yup[3])
+ABC_INLINE void copy_zup_from_yup(short zup[3], const short yup[3])
 {
 	zup[0] = yup[0];
 	zup[1] = -yup[2];
@@ -132,18 +132,37 @@ ABC_INLINE void copy_yup_zup(short zup[3], const short yup[3])
 
 /* Copy from Z-up to Y-up. */
 
-ABC_INLINE void copy_zup_yup(float yup[3], const float zup[3])
+ABC_INLINE void copy_yup_from_zup(float yup[3], const float zup[3])
 {
 	yup[0] = zup[0];
 	yup[1] = zup[2];
 	yup[2] = -zup[1];
 }
 
-ABC_INLINE void copy_zup_yup(short yup[3], const short zup[3])
+ABC_INLINE void copy_yup_from_zup(short yup[3], const short zup[3])
 {
 	yup[0] = zup[0];
 	yup[1] = zup[2];
 	yup[2] = -zup[1];
 }
 
+/* *************************** */
+
+#undef ABC_DEBUG_TIME
+
+class ScopeTimer {
+	const char *m_message;
+	double m_start;
+
+public:
+	ScopeTimer(const char *message);
+	~ScopeTimer();
+};
+
+#ifdef ABC_DEBUG_TIME
+#	define SCOPE_TIMER(message) ScopeTimer prof(message)
+#else
+#	define SCOPE_TIMER(message)
+#endif
+
 #endif  /* __ABC_UTIL_H__ */
diff --git a/source/blender/alembic/intern/alembic_capi.cc b/source/blender/alembic/intern/alembic_capi.cc
index d8d017119b1..dc5146a26e0 100644
--- a/source/blender/alembic/intern/alembic_capi.cc
+++ b/source/blender/alembic/intern/alembic_capi.cc
@@ -542,6 +542,8 @@ ABC_INLINE bool is_mesh_and_strands(const IObject &object)
 
 static void import_startjob(void *user_data, short *stop, short *do_update, float *progress)
 {
+	SCOPE_TIMER("Alembic import, objects reading and creation");
+
 	ImportJobData *data = static_cast<ImportJobData *>(user_data);
 
 	data->stop = stop;
@@ -677,6 +679,8 @@ static void import_startjob(void *user_data, short *stop, short *do_update, floa
 
 static void import_endjob(void *user_data)
 {
+	SCOPE_TIMER("Alembic import, cleanup");
+
 	ImportJobData *data = static_cast<ImportJobData *>(user_data);
 
 	std::vector<AbcObjectReader *>::iterator iter;
diff --git a/source/blender/blenkernel/BKE_appdir.h b/source/blender/blenkernel/BKE_appdir.h
index 077fe2a629c..ac8f861fa56 100644
--- a/source/blender/blenkernel/BKE_appdir.h
+++ b/source/blender/blenkernel/BKE_appdir.h
@@ -27,11 +27,15 @@
 /* note on naming: typical _get() suffix is omitted here,
  * since it's the main purpose of the API. */
 const char *BKE_appdir_folder_default(void);
+const char *BKE_appdir_folder_id_ex(const int folder_id, const char *subfolder, char *path, size_t path_len);
 const char *BKE_appdir_folder_id(const int folder_id, const char *subfolder);
 const char *BKE_appdir_folder_id_create(const int folder_id, const char *subfolder);
 const char *BKE_appdir_folder_id_user_notest(const int folder_id, const char *subfolder);
 const char *BKE_appdir_folder_id_version(const int folder_id, const int ver, const bool do_check);
 
+bool BKE_appdir_app_template_any(void);
+bool BKE_appdir_app_template_id_search(const char *app_template, char *path, size_t path_len);
+
 /* Initialize path to program executable */
 void        BKE_appdir_program_path_init(const char *argv0);
 
diff --git a/source/blender/blenkernel/BKE_blender.h b/source/blender/blenkernel/BKE_blender.h
index d2d9c763031..d55926ffb1e 100644
--- a/source/blender/blenkernel/BKE_blender.h
+++ b/source/blender/blenkernel/BKE_blender.h
@@ -38,14 +38,22 @@
 extern "C" {
 #endif
 
+struct UserDef;
+
 void BKE_blender_free(void);
 
 void BKE_blender_globals_init(void);
 void BKE_blender_globals_clear(void);
+void BKE_blender_version_string(
+        char *version_str, size_t maxncpy,
+        short version, short subversion, bool v_prefix, bool include_subversion);
 
-void BKE_blender_userdef_free(void);
+void BKE_blender_userdef_set_data(struct UserDef *userdef);
+void BKE_blender_userdef_free_data(struct UserDef *userdef);
 void BKE_blender_userdef_refresh(void);
-	
+
+void BKE_blender_userdef_set_app_template(struct UserDef *userdef);
+
 /* set this callback when a UI is running */
 void BKE_blender_callback_test_break_set(void (*func)(void));
 int  BKE_blender_test_break(void);
diff --git a/source/blender/blenkernel/BKE_blendfile.h b/source/blender/blenkernel/BKE_blendfile.h
index 6767fce3abd..ac58451e412 100644
--- a/source/blender/blenkernel/BKE_blendfile.h
+++ b/source/blender/blenkernel/BKE_blendfile.h
@@ -33,8 +33,7 @@ struct ID;
 struct Main;
 struct MemFile;
 struct ReportList;
-
-int BKE_blendfile_read(struct bContext *C, const char *filepath, struct ReportList *reports);
+struct UserDef;
 
 enum {
 	BKE_BLENDFILE_READ_FAIL             = 0, /* no load */
@@ -42,15 +41,24 @@ enum {
 	BKE_BLENDFILE_READ_OK_USERPREFS     = 2, /* OK, and with new user settings */
 };
 
+int BKE_blendfile_read(
+        struct bContext *C, const char *filepath,
+        struct ReportList *reports, int skip_flag);
 bool BKE_blendfile_read_from_memory(
-        struct bContext *C, const void *filebuf,
-        int filelength, struct ReportList *reports, bool update_defaults);
+        struct bContext *C, const void *filebuf, int filelength,
+        struct ReportList *reports, int skip_flag, bool update_defaults);
 bool BKE_blendfile_read_from_memfile(
         struct bContext *C, struct MemFile *memfile,
+        struct ReportList *reports, int skip_flag);
+void BKE_blendfile_read_make_empty(struct bContext *C);
+
+struct UserDef *BKE_blendfile_userdef_read(
+        const char *filepath, struct ReportList *reports);
+struct UserDef *BKE_blendfile_userdef_read_from_memory(
+        const void *filebuf, int filelength,
         struct ReportList *reports);
 
-int BKE_blendfile_read_userdef(const char *filepath, struct ReportList *reports);
-int BKE_blendfile_write_userdef(const char *filepath, struct ReportList *reports);
+int BKE_blendfile_userdef_write(const char *filepath, struct ReportList *reports);
 
 
 /* partial blend file writing */
diff --git a/source/blender/blenkernel/BKE_bvhutils.h b/source/blender/blenkernel/BKE_bvhutils.h
index 07db2217bac..cb72f0859d5 100644
--- a/source/blender/blenkernel/BKE_bvhutils.h
+++ b/source/blender/blenkernel/BKE_bvhutils.h
@@ -54,7 +54,6 @@ typedef struct BVHTreeFromEditMesh {
 	/* default callbacks to bvh nearest and raycast */
 	BVHTree_NearestPointCallback nearest_callback;
 	BVHTree_RayCastCallback raycast_callback;
-	BVHTree_NearestToRayCallback nearest_to_ray_callback;
 
 	struct BMEditMesh *em;
 
@@ -75,7 +74,6 @@ typedef struct BVHTreeFromMesh {
 	/* default callbacks to bvh nearest and raycast */
 	BVHTree_NearestPointCallback nearest_callback;
 	BVHTree_RayCastCallback raycast_callback;
-	BVHTree_NearestToRayCallback nearest_to_ray_callback;
 
 	/* Vertex array, so that callbacks have instant access to data */
 	const struct MVert *vert;
@@ -104,7 +102,7 @@ typedef struct BVHTreeFromMesh {
  * The tree is built in mesh space coordinates; this means special care must be taken on queries
  * so that the coordinates and rays are first translated into the mesh local coordinates.
  * Reason for this is that bvh_from_mesh_* can use a cache in some cases and so it becomes possible to reuse a BVHTree.
- * 
+ *
  * free_bvhtree_from_mesh should be called when the tree is no longer needed.
  */
 BVHTree *bvhtree_from_editmesh_verts(
@@ -118,7 +116,7 @@ BVHTree *bvhtree_from_editmesh_verts_ex(
 BVHTree *bvhtree_from_mesh_verts(
         struct BVHTreeFromMesh *data, struct DerivedMesh *mesh, float epsilon, int tree_type, int axis);
 BVHTree *bvhtree_from_mesh_verts_ex(
-        struct BVHTreeFromMesh *data, struct MVert *vert, const int numVerts,
+        struct BVHTreeFromMesh *data, const struct MVert *vert, const int numVerts,
         const bool vert_allocated, const BLI_bitmap *mask, int verts_num_active,
         float epsilon, int tree_type, int axis);
 
@@ -133,14 +131,20 @@ BVHTree *bvhtree_from_editmesh_edges_ex(
 BVHTree *bvhtree_from_mesh_edges(
         struct BVHTreeFromMesh *data, struct DerivedMesh *mesh,
         float epsilon, int tree_type, int axis);
+BVHTree *bvhtree_from_mesh_edges_ex(
+        struct BVHTreeFromMesh *data,
+        const struct MVert *vert, const bool vert_allocated,
+        const struct MEdge *edge, const int edges_num, const bool edge_allocated,
+        const BLI_bitmap *edges_mask, int edges_num_active,
+        float epsilon, int tree_type, int axis);
 
 BVHTree *bvhtree_from_mesh_faces(
         struct BVHTreeFromMesh *data, struct DerivedMesh *mesh, float epsilon,
         int tree_type, int axis);
 BVHTree *bvhtree_from_mesh_faces_ex(
         struct BVHTreeFromMesh *data,
-        struct MVert *vert, const bool vert_allocated,
-        struct MFace *face, const int numFaces, const bool face_allocated,
+        const struct MVert *vert, const bool vert_allocated,
+        const struct MFace *face, const int numFaces, const bool face_allocated,
         const BLI_bitmap *mask, int numFaces_active,
         float epsilon, int tree_type, int axis);
 
diff --git a/source/blender/blenkernel/BKE_cachefile.h b/source/blender/blenkernel/BKE_cachefile.h
index 7e1c069df9a..0d4ed2083b1 100644
--- a/source/blender/blenkernel/BKE_cachefile.h
+++ b/source/blender/blenkernel/BKE_cachefile.h
@@ -55,7 +55,7 @@ void BKE_cachefile_reload(const struct Main *bmain, struct CacheFile *cache_file
 
 void BKE_cachefile_ensure_handle(const struct Main *bmain, struct CacheFile *cache_file);
 
-void BKE_cachefile_update_frame(struct Main *bmain, struct Scene *scene,const float ctime, const float fps);
+void BKE_cachefile_update_frame(struct Main *bmain, struct Scene *scene, const float ctime, const float fps);
 
 bool BKE_cachefile_filepath_get(
         const struct Main *bmain, const struct CacheFile *cache_file, float frame,
diff --git a/source/blender/blenkernel/BKE_cdderivedmesh.h b/source/blender/blenkernel/BKE_cdderivedmesh.h
index 9948f21ba90..4876461bfe0 100644
--- a/source/blender/blenkernel/BKE_cdderivedmesh.h
+++ b/source/blender/blenkernel/BKE_cdderivedmesh.h
@@ -79,6 +79,7 @@ DerivedMesh *CDDM_from_curve_displist(struct Object *ob, struct ListBase *dispba
  */
 struct DerivedMesh *CDDM_copy(struct DerivedMesh *dm);
 struct DerivedMesh *CDDM_copy_from_tessface(struct DerivedMesh *dm);
+struct DerivedMesh *CDDM_copy_with_tessface(struct DerivedMesh *dm);
 
 /* creates a CDDerivedMesh with the same layer stack configuration as the
  * given DerivedMesh and containing the requested numbers of elements.
diff --git a/source/blender/blenkernel/BKE_cloth.h b/source/blender/blenkernel/BKE_cloth.h
index 36330242f18..6c517bd02df 100644
--- a/source/blender/blenkernel/BKE_cloth.h
+++ b/source/blender/blenkernel/BKE_cloth.h
@@ -136,9 +136,6 @@ typedef struct ClothSpring {
 	float	restlen;	/* The original length of the spring.	*/
 	int	type;		/* types defined in BKE_cloth.h ("springType") */
 	int	flags; 		/* defined in BKE_cloth.h, e.g. deactivated due to tearing */
-	float dfdx[3][3];
-	float dfdv[3][3];
-	float f[3];
 	float 	stiffness;	/* stiffness factor from the vertex groups */
 	float editrestlen;
 	
@@ -240,9 +237,6 @@ void bvhselftree_update_from_cloth(struct ClothModifierData *clmd, bool moving);
 // needed for button_object.c
 void cloth_clear_cache (struct Object *ob, struct ClothModifierData *clmd, float framenr );
 
-// needed for cloth.c
-int cloth_add_spring (struct ClothModifierData *clmd, unsigned int indexA, unsigned int indexB, float restlength, int spring_type);
-
 void cloth_parallel_transport_hair_frame(float mat[3][3], const float dir_old[3], const float dir_new[3]);
 
 ////////////////////////////////////////////////
diff --git a/source/blender/blenkernel/BKE_curve.h b/source/blender/blenkernel/BKE_curve.h
index 5558786d254..e111bd0e16b 100644
--- a/source/blender/blenkernel/BKE_curve.h
+++ b/source/blender/blenkernel/BKE_curve.h
@@ -36,6 +36,7 @@
 struct BezTriple;
 struct Curve;
 struct EditNurb;
+struct GHash;
 struct ListBase;
 struct Main;
 struct Nurb;
@@ -52,6 +53,13 @@ typedef struct CurveCache {
 	struct Path *path;
 } CurveCache;
 
+/* Definitions needed for shape keys */
+typedef struct CVKeyIndex {
+	void *orig_cv;
+	int key_index, nu_index, pt_index, vertex_index;
+	bool switched;
+} CVKeyIndex;
+
 #define KNOTSU(nu)      ( (nu)->orderu + (nu)->pntsu + (((nu)->flagu & CU_NURB_CYCLIC) ? ((nu)->orderu - 1) : 0) )
 #define KNOTSV(nu)      ( (nu)->orderv + (nu)->pntsv + (((nu)->flagv & CU_NURB_CYCLIC) ? ((nu)->orderv - 1) : 0) )
 
@@ -108,7 +116,8 @@ void BK_curve_nurbs_vertexCos_apply(struct ListBase *lb, float (*vertexCos)[3]);
 float (*BKE_curve_nurbs_keyVertexCos_get(struct ListBase *lb, float *key))[3];
 void BKE_curve_nurbs_keyVertexTilts_apply(struct ListBase *lb, float *key);
 
-void BKE_curve_editNurb_keyIndex_free(struct EditNurb *editnurb);
+void BKE_curve_editNurb_keyIndex_delCV(struct GHash *keyindex, const void *cv);
+void BKE_curve_editNurb_keyIndex_free(struct GHash **keyindex);
 void BKE_curve_editNurb_free(struct Curve *cu);
 struct ListBase *BKE_curve_editNurbs_get(struct Curve *cu);
 
diff --git a/source/blender/blenkernel/BKE_mesh.h b/source/blender/blenkernel/BKE_mesh.h
index d41878825bb..b83bec5a302 100644
--- a/source/blender/blenkernel/BKE_mesh.h
+++ b/source/blender/blenkernel/BKE_mesh.h
@@ -131,8 +131,7 @@ bool BKE_mesh_uv_cdlayer_rename(struct Mesh *me, const char *old_name, const cha
 
 float (*BKE_mesh_vertexCos_get(const struct Mesh *me, int *r_numVerts))[3];
 
-void BKE_mesh_calc_normals_split(struct Mesh *mesh);
-void BKE_mesh_split_faces(struct Mesh *mesh);
+void BKE_mesh_split_faces(struct Mesh *mesh, bool free_loop_normals);
 
 struct Mesh *BKE_mesh_new_from_object(struct Main *bmain, struct Scene *sce, struct Object *ob,
                                       int apply_modifiers, int settings, int calc_tessface, int calc_undeformed);
@@ -228,6 +227,9 @@ void BKE_lnor_space_custom_normal_to_data(MLoopNorSpace *lnor_space, const float
 
 bool BKE_mesh_has_custom_loop_normals(struct Mesh *me);
 
+void BKE_mesh_calc_normals_split(struct Mesh *mesh);
+void BKE_mesh_calc_normals_split_ex(struct Mesh *mesh, struct MLoopNorSpaceArray *r_lnors_spacearr);
+
 void BKE_mesh_normals_loop_split(
         const struct MVert *mverts, const int numVerts, struct MEdge *medges, const int numEdges,
         struct MLoop *mloops, float (*r_loopnors)[3], const int numLoops,
diff --git a/source/blender/blenkernel/BKE_node.h b/source/blender/blenkernel/BKE_node.h
index e3f28b332cf..7544e8f5b3e 100644
--- a/source/blender/blenkernel/BKE_node.h
+++ b/source/blender/blenkernel/BKE_node.h
@@ -49,7 +49,7 @@
 #include "RNA_types.h"
 
 /* not very important, but the stack solver likes to know a maximum */
-#define MAX_SOCKET	64
+#define MAX_SOCKET	512
 
 struct bContext;
 struct bNode;
diff --git a/source/blender/blenkernel/BKE_object.h b/source/blender/blenkernel/BKE_object.h
index d812ab832a1..b6633ea6198 100644
--- a/source/blender/blenkernel/BKE_object.h
+++ b/source/blender/blenkernel/BKE_object.h
@@ -139,8 +139,6 @@ void BKE_boundbox_init_from_minmax(struct BoundBox *bb, const float min[3], cons
 void BKE_boundbox_calc_center_aabb(const struct BoundBox *bb, float r_cent[3]);
 void BKE_boundbox_calc_size_aabb(const struct BoundBox *bb, float r_size[3]);
 void BKE_boundbox_minmax(const struct BoundBox *bb, float obmat[4][4], float r_min[3], float r_max[3]);
-struct BoundBox *BKE_boundbox_ensure_minimum_dimensions(
-        struct BoundBox *bb, struct BoundBox *bb_temp, const float epsilon);
 
 struct BoundBox *BKE_object_boundbox_get(struct Object *ob);
 void BKE_object_dimensions_get(struct Object *ob, float vec[3]);
@@ -202,6 +200,11 @@ void BKE_object_eval_uber_data(struct EvaluationContext *eval_ctx,
                                struct Scene *scene,
                                struct Object *ob);
 
+void BKE_object_eval_cloth(struct EvaluationContext *eval_ctx,
+                           struct Scene *scene,
+                           struct Object *object);
+
+
 void BKE_object_handle_data_update(struct EvaluationContext *eval_ctx,
                                    struct Scene *scene,
                                    struct Object *ob);
diff --git a/source/blender/blenkernel/BKE_particle.h b/source/blender/blenkernel/BKE_particle.h
index e5967be0bc7..2b6a84a2f87 100644
--- a/source/blender/blenkernel/BKE_particle.h
+++ b/source/blender/blenkernel/BKE_particle.h
@@ -207,7 +207,7 @@ typedef struct ParticleCollisionElement {
 typedef struct ParticleCollision {
 	struct Object *current;
 	struct Object *hit;
-	struct Object *skip[PARTICLE_COLLISION_MAX_COLLISIONS+1];
+	struct Object *skip[PARTICLE_COLLISION_MAX_COLLISIONS + 1];
 	struct Object *emitter;
 
 	struct CollisionModifierData *md; // collision modifier for current object;
diff --git a/source/blender/blenkernel/BKE_texture.h b/source/blender/blenkernel/BKE_texture.h
index 1c5ea946f59..9a60eb29957 100644
--- a/source/blender/blenkernel/BKE_texture.h
+++ b/source/blender/blenkernel/BKE_texture.h
@@ -42,6 +42,7 @@ struct Brush;
 struct ColorBand;
 struct EnvMap;
 struct FreestyleLineStyle;
+struct ImagePool;
 struct Lamp;
 struct Main;
 struct Material;
@@ -133,6 +134,12 @@ struct OceanTex *BKE_texture_ocean_copy(struct OceanTex *ot);
 bool    BKE_texture_dependsOnTime(const struct Tex *texture);
 bool    BKE_texture_is_image_user(const struct Tex *tex);
 
+void BKE_texture_get_value_ex(
+        const struct Scene *scene, struct Tex *texture,
+        float *tex_co, struct TexResult *texres,
+        struct ImagePool *pool,
+        bool use_color_management);
+
 void BKE_texture_get_value(
         const struct Scene *scene, struct Tex *texture,
         float *tex_co, struct TexResult *texres, bool use_color_management);
diff --git a/source/blender/blenkernel/intern/DerivedMesh.c b/source/blender/blenkernel/intern/DerivedMesh.c
index 160adcc3988..8b7329cf940 100644
--- a/source/blender/blenkernel/intern/DerivedMesh.c
+++ b/source/blender/blenkernel/intern/DerivedMesh.c
@@ -2912,9 +2912,6 @@ DerivedMesh *editbmesh_get_derived_base(Object *obedit, BMEditMesh *em, CustomDa
 /* get derived mesh from an object, using editbmesh if available. */
 DerivedMesh *object_get_derived_final(Object *ob, const bool for_render)
 {
-	Mesh *me = ob->data;
-	BMEditMesh *em = me->edit_btmesh;
-
 	if (for_render) {
 		/* TODO(sergey): use proper derived render here in the future. */
 		return ob->derivedFinal;
@@ -2922,9 +2919,13 @@ DerivedMesh *object_get_derived_final(Object *ob, const bool for_render)
 
 	/* only return the editmesh if its from this object because
 	 * we don't a mesh from another object's modifier stack: T43122 */
-	if (em && (em->ob == ob)) {
-		DerivedMesh *dm = em->derivedFinal;
-		return dm;
+	if (ob->type == OB_MESH) {
+		Mesh *me = ob->data;
+		BMEditMesh *em = me->edit_btmesh;
+		if (em && (em->ob == ob)) {
+			DerivedMesh *dm = em->derivedFinal;
+			return dm;
+		}
 	}
 
 	return ob->derivedFinal;
diff --git a/source/blender/blenkernel/intern/appdir.c b/source/blender/blenkernel/intern/appdir.c
index b1dcc40279f..43fd47981b1 100644
--- a/source/blender/blenkernel/intern/appdir.c
+++ b/source/blender/blenkernel/intern/appdir.c
@@ -114,18 +114,26 @@ static char *blender_version_decimal(const int ver)
  * Concatenates path_base, (optional) path_sep and (optional) folder_name into targetpath,
  * returning true if result points to a directory.
  */
-static bool test_path(char *targetpath, const char *path_base, const char *path_sep, const char *folder_name)
+static bool test_path(
+        char *targetpath, size_t targetpath_len,
+        const char *path_base, const char *path_sep, const char *folder_name)
 {
 	char tmppath[FILE_MAX];
 	
-	if (path_sep) BLI_join_dirfile(tmppath, sizeof(tmppath), path_base, path_sep);
-	else BLI_strncpy(tmppath, path_base, sizeof(tmppath));
-
-	/* rare cases folder_name is omitted (when looking for ~/.blender/2.xx dir only) */
-	if (folder_name)
-		BLI_make_file_string("/", targetpath, tmppath, folder_name);
-	else
-		BLI_strncpy(targetpath, tmppath, sizeof(tmppath));
+	if (path_sep) {
+		BLI_join_dirfile(tmppath, sizeof(tmppath), path_base, path_sep);
+	}
+	else {
+		BLI_strncpy(tmppath, path_base, sizeof(tmppath));
+	}
+
+	/* rare cases folder_name is omitted (when looking for ~/.config/blender/2.xx dir only) */
+	if (folder_name) {
+		BLI_join_dirfile(targetpath, targetpath_len, tmppath, folder_name);
+	}
+	else {
+		BLI_strncpy(targetpath, tmppath, targetpath_len);
+	}
 	/* FIXME: why is "//" on front of tmppath expanded to "/" (by BLI_join_dirfile)
 	 * if folder_name is specified but not otherwise? */
 
@@ -179,7 +187,9 @@ static bool test_env_path(char *path, const char *envvar)
  * \param ver  To construct name of version-specific directory within bprogdir
  * \return true if such a directory exists.
  */
-static bool get_path_local(char *targetpath, const char *folder_name, const char *subfolder_name, const int ver)
+static bool get_path_local(
+        char *targetpath, size_t targetpath_len,
+        const char *folder_name, const char *subfolder_name, const int ver)
 {
 	char relfolder[FILE_MAX];
 	
@@ -201,11 +211,12 @@ static bool get_path_local(char *targetpath, const char *folder_name, const char
 
 	/* try EXECUTABLE_DIR/2.5x/folder_name - new default directory for local blender installed files */
 #ifdef __APPLE__
-	static char osx_resourses[FILE_MAX]; /* due new codesign situation in OSX > 10.9.5 we must move the blender_version dir with contents to Resources */
+	/* due new codesign situation in OSX > 10.9.5 we must move the blender_version dir with contents to Resources */
+	static char osx_resourses[FILE_MAX];
 	sprintf(osx_resourses, "%s../Resources", bprogdir);
-	return test_path(targetpath, osx_resourses, blender_version_decimal(ver), relfolder);
+	return test_path(targetpath, targetpath_len, osx_resourses, blender_version_decimal(ver), relfolder);
 #else
-	return test_path(targetpath, bprogdir, blender_version_decimal(ver), relfolder);
+	return test_path(targetpath, targetpath_len, bprogdir, blender_version_decimal(ver), relfolder);
 #endif
 }
 
@@ -219,7 +230,7 @@ static bool is_portable_install(void)
 	const int ver = BLENDER_VERSION;
 	char path[FILE_MAX];
 
-	return get_path_local(path, "config", NULL, ver);
+	return get_path_local(path, sizeof(path), "config", NULL, ver);
 }
 
 /**
@@ -233,20 +244,22 @@ static bool is_portable_install(void)
  * \param ver  Blender version, used to construct a subdirectory name
  * \return true if it was able to construct such a path.
  */
-static bool get_path_user(char *targetpath, const char *folder_name, const char *subfolder_name, const char *envvar, const int ver)
+static bool get_path_user(
+        char *targetpath, size_t targetpath_len, const char *folder_name, const char *subfolder_name,
+        const char *envvar, const int ver)
 {
 	char user_path[FILE_MAX];
 	const char *user_base_path;
 
 	/* for portable install, user path is always local */
-	if (is_portable_install())
-		return get_path_local(targetpath, folder_name, subfolder_name, ver);
-	
+	if (is_portable_install()) {
+		return get_path_local(targetpath, targetpath_len, folder_name, subfolder_name, ver);
+	}
 	user_path[0] = '\0';
 
 	if (test_env_path(user_path, envvar)) {
 		if (subfolder_name) {
-			return test_path(targetpath, user_path, NULL, subfolder_name);
+			return test_path(targetpath, targetpath_len, user_path, NULL, subfolder_name);
 		}
 		else {
 			BLI_strncpy(targetpath, user_path, FILE_MAX);
@@ -266,10 +279,10 @@ static bool get_path_user(char *targetpath, const char *folder_name, const char
 #endif
 	
 	if (subfolder_name) {
-		return test_path(targetpath, user_path, folder_name, subfolder_name);
+		return test_path(targetpath, targetpath_len, user_path, folder_name, subfolder_name);
 	}
 	else {
-		return test_path(targetpath, user_path, NULL, folder_name);
+		return test_path(targetpath, targetpath_len, user_path, NULL, folder_name);
 	}
 }
 
@@ -283,7 +296,9 @@ static bool get_path_user(char *targetpath, const char *folder_name, const char
  * \param ver  Blender version, used to construct a subdirectory name
  * \return  true if it was able to construct such a path.
  */
-static bool get_path_system(char *targetpath, const char *folder_name, const char *subfolder_name, const char *envvar, const int ver)
+static bool get_path_system(
+        char *targetpath, size_t targetpath_len, const char *folder_name, const char *subfolder_name,
+        const char *envvar, const int ver)
 {
 	char system_path[FILE_MAX];
 	const char *system_base_path;
@@ -307,13 +322,13 @@ static bool get_path_system(char *targetpath, const char *folder_name, const cha
 
 	/* try CWD/release/folder_name */
 	if (BLI_current_working_dir(cwd, sizeof(cwd))) {
-		if (test_path(targetpath, cwd, "release", relfolder)) {
+		if (test_path(targetpath, targetpath_len, cwd, "release", relfolder)) {
 			return true;
 		}
 	}
 
 	/* try EXECUTABLE_DIR/release/folder_name */
-	if (test_path(targetpath, bprogdir, "release", relfolder))
+	if (test_path(targetpath, targetpath_len, bprogdir, "release", relfolder))
 		return true;
 
 	/* end developer overrides */
@@ -324,7 +339,7 @@ static bool get_path_system(char *targetpath, const char *folder_name, const cha
 
 	if (test_env_path(system_path, envvar)) {
 		if (subfolder_name) {
-			return test_path(targetpath, system_path, NULL, subfolder_name);
+			return test_path(targetpath, targetpath_len, system_path, NULL, subfolder_name);
 		}
 		else {
 			BLI_strncpy(targetpath, system_path, FILE_MAX);
@@ -345,57 +360,63 @@ static bool get_path_system(char *targetpath, const char *folder_name, const cha
 	
 	if (subfolder_name) {
 		/* try $BLENDERPATH/folder_name/subfolder_name */
-		return test_path(targetpath, system_path, folder_name, subfolder_name);
+		return test_path(targetpath, targetpath_len, system_path, folder_name, subfolder_name);
 	}
 	else {
 		/* try $BLENDERPATH/folder_name */
-		return test_path(targetpath, system_path, NULL, folder_name);
+		return test_path(targetpath, targetpath_len, system_path, NULL, folder_name);
 	}
 }
 
-/* get a folder out of the 'folder_id' presets for paths */
-/* returns the path if found, NULL string if not */
-const char *BKE_appdir_folder_id(const int folder_id, const char *subfolder)
+/**
+ * Get a folder out of the 'folder_id' presets for paths.
+ * returns the path if found, NULL string if not
+ *
+ * \param subfolder: The name of a directory to check for,
+ * this may contain path separators but must resolve to a directory, checked with #BLI_is_dir.
+ */
+const char *BKE_appdir_folder_id_ex(
+        const int folder_id, const char *subfolder,
+        char *path, size_t path_len)
 {
 	const int ver = BLENDER_VERSION;
-	static char path[FILE_MAX] = "";
-	
+
 	switch (folder_id) {
 		case BLENDER_DATAFILES:     /* general case */
-			if (get_path_user(path, "datafiles", subfolder, "BLENDER_USER_DATAFILES", ver)) break;
-			if (get_path_local(path, "datafiles", subfolder, ver)) break;
-			if (get_path_system(path, "datafiles", subfolder, "BLENDER_SYSTEM_DATAFILES", ver)) break;
+			if (get_path_user(path, path_len, "datafiles", subfolder, "BLENDER_USER_DATAFILES", ver)) break;
+			if (get_path_local(path, path_len, "datafiles", subfolder, ver)) break;
+			if (get_path_system(path, path_len, "datafiles", subfolder, "BLENDER_SYSTEM_DATAFILES", ver)) break;
 			return NULL;
 			
 		case BLENDER_USER_DATAFILES:
-			if (get_path_user(path, "datafiles", subfolder, "BLENDER_USER_DATAFILES", ver)) break;
+			if (get_path_user(path, path_len, "datafiles", subfolder, "BLENDER_USER_DATAFILES", ver)) break;
 			return NULL;
 			
 		case BLENDER_SYSTEM_DATAFILES:
-			if (get_path_local(path, "datafiles", subfolder, ver)) break;
-			if (get_path_system(path, "datafiles", subfolder, "BLENDER_SYSTEM_DATAFILES", ver)) break;
+			if (get_path_local(path, path_len, "datafiles", subfolder, ver)) break;
+			if (get_path_system(path, path_len, "datafiles", subfolder, "BLENDER_SYSTEM_DATAFILES", ver)) break;
 			return NULL;
 			
 		case BLENDER_USER_AUTOSAVE:
-			if (get_path_user(path, "autosave", subfolder, "BLENDER_USER_DATAFILES", ver)) break;
+			if (get_path_user(path, path_len, "autosave", subfolder, "BLENDER_USER_DATAFILES", ver)) break;
 			return NULL;
 
 		case BLENDER_USER_CONFIG:
-			if (get_path_user(path, "config", subfolder, "BLENDER_USER_CONFIG", ver)) break;
+			if (get_path_user(path, path_len, "config", subfolder, "BLENDER_USER_CONFIG", ver)) break;
 			return NULL;
 			
 		case BLENDER_USER_SCRIPTS:
-			if (get_path_user(path, "scripts", subfolder, "BLENDER_USER_SCRIPTS", ver)) break;
+			if (get_path_user(path, path_len, "scripts", subfolder, "BLENDER_USER_SCRIPTS", ver)) break;
 			return NULL;
 			
 		case BLENDER_SYSTEM_SCRIPTS:
-			if (get_path_local(path, "scripts", subfolder, ver)) break;
-			if (get_path_system(path, "scripts", subfolder, "BLENDER_SYSTEM_SCRIPTS", ver)) break;
+			if (get_path_local(path, path_len, "scripts", subfolder, ver)) break;
+			if (get_path_system(path, path_len, "scripts", subfolder, "BLENDER_SYSTEM_SCRIPTS", ver)) break;
 			return NULL;
 			
 		case BLENDER_SYSTEM_PYTHON:
-			if (get_path_local(path, "python", subfolder, ver)) break;
-			if (get_path_system(path, "python", subfolder, "BLENDER_SYSTEM_PYTHON", ver)) break;
+			if (get_path_local(path, path_len, "python", subfolder, ver)) break;
+			if (get_path_system(path, path_len, "python", subfolder, "BLENDER_SYSTEM_PYTHON", ver)) break;
 			return NULL;
 
 		default:
@@ -406,6 +427,13 @@ const char *BKE_appdir_folder_id(const int folder_id, const char *subfolder)
 	return path;
 }
 
+const char *BKE_appdir_folder_id(
+        const int folder_id, const char *subfolder)
+{
+	static char path[FILE_MAX] = "";
+	return BKE_appdir_folder_id_ex(folder_id, subfolder, path, sizeof(path));
+}
+
 /**
  * Returns the path to a folder in the user area without checking that it actually exists first.
  */
@@ -416,16 +444,16 @@ const char *BKE_appdir_folder_id_user_notest(const int folder_id, const char *su
 
 	switch (folder_id) {
 		case BLENDER_USER_DATAFILES:
-			get_path_user(path, "datafiles", subfolder, "BLENDER_USER_DATAFILES", ver);
+			get_path_user(path, sizeof(path), "datafiles", subfolder, "BLENDER_USER_DATAFILES", ver);
 			break;
 		case BLENDER_USER_CONFIG:
-			get_path_user(path, "config", subfolder, "BLENDER_USER_CONFIG", ver);
+			get_path_user(path, sizeof(path), "config", subfolder, "BLENDER_USER_CONFIG", ver);
 			break;
 		case BLENDER_USER_AUTOSAVE:
-			get_path_user(path, "autosave", subfolder, "BLENDER_USER_AUTOSAVE", ver);
+			get_path_user(path, sizeof(path), "autosave", subfolder, "BLENDER_USER_AUTOSAVE", ver);
 			break;
 		case BLENDER_USER_SCRIPTS:
-			get_path_user(path, "scripts", subfolder, "BLENDER_USER_SCRIPTS", ver);
+			get_path_user(path, sizeof(path), "scripts", subfolder, "BLENDER_USER_SCRIPTS", ver);
 			break;
 		default:
 			BLI_assert(0);
@@ -469,13 +497,13 @@ const char *BKE_appdir_folder_id_version(const int folder_id, const int ver, con
 	bool ok;
 	switch (folder_id) {
 		case BLENDER_RESOURCE_PATH_USER:
-			ok = get_path_user(path, NULL, NULL, NULL, ver);
+			ok = get_path_user(path, sizeof(path), NULL, NULL, NULL, ver);
 			break;
 		case BLENDER_RESOURCE_PATH_LOCAL:
-			ok = get_path_local(path, NULL, NULL, ver);
+			ok = get_path_local(path, sizeof(path), NULL, NULL, ver);
 			break;
 		case BLENDER_RESOURCE_PATH_SYSTEM:
-			ok = get_path_system(path, NULL, NULL, NULL, ver);
+			ok = get_path_system(path, sizeof(path), NULL, NULL, NULL, ver);
 			break;
 		default:
 			path[0] = '\0'; /* in case do_check is false */
@@ -655,6 +683,48 @@ bool BKE_appdir_program_python_search(
 	return is_found;
 }
 
+static const char *app_template_directory_search[2] = {
+	"startup" SEP_STR "bl_app_templates_user",
+	"startup" SEP_STR "bl_app_templates_system",
+};
+
+static const int app_template_directory_id[2] = {
+	BLENDER_USER_SCRIPTS,
+	BLENDER_SYSTEM_SCRIPTS,
+};
+
+/**
+ * Return true if templates exist
+ */
+bool BKE_appdir_app_template_any(void)
+{
+	char temp_dir[FILE_MAX];
+	for (int i = 0; i < 2; i++) {
+		if (BKE_appdir_folder_id_ex(
+		        app_template_directory_id[i], app_template_directory_search[i],
+		        temp_dir, sizeof(temp_dir)))
+		{
+			return true;
+		}
+	}
+	return false;
+}
+
+bool BKE_appdir_app_template_id_search(const char *app_template, char *path, size_t path_len)
+{
+	for (int i = 0; i < 2; i++) {
+		char subdir[FILE_MAX];
+		BLI_join_dirfile(subdir, sizeof(subdir), app_template_directory_search[i], app_template);
+		if (BKE_appdir_folder_id_ex(
+		        app_template_directory_id[i], subdir,
+		        path, path_len))
+		{
+			return true;
+		}
+	}
+	return false;
+}
+
 /**
  * Gets the temp directory when blender first runs.
  * If the default path is not found, use try $TEMP
@@ -713,7 +783,8 @@ static void where_is_temp(char *fullname, char *basename, const size_t maxlen, c
 		BLI_add_slash(fullname);
 #ifdef WIN32
 		if (userdir && userdir != fullname) {
-			BLI_strncpy(userdir, fullname, maxlen); /* also set user pref to show %TEMP%. /tmp/ is just plain confusing for Windows users. */
+			/* also set user pref to show %TEMP%. /tmp/ is just plain confusing for Windows users. */
+			BLI_strncpy(userdir, fullname, maxlen);
 		}
 #endif
 	}
@@ -755,7 +826,6 @@ static void where_is_temp(char *fullname, char *basename, const size_t maxlen, c
 void BKE_tempdir_init(char *userdir)
 {
 	where_is_temp(btempdir_session, btempdir_base, FILE_MAX, userdir);
-;
 }
 
 /**
diff --git a/source/blender/blenkernel/intern/armature.c b/source/blender/blenkernel/intern/armature.c
index 0287d6ae9ca..02cc31e5977 100644
--- a/source/blender/blenkernel/intern/armature.c
+++ b/source/blender/blenkernel/intern/armature.c
@@ -1037,6 +1037,17 @@ void armature_deform_verts(Object *armOb, Object *target, DerivedMesh *dm, float
 			if (use_dverts) {
 				defnrToPC = MEM_callocN(sizeof(*defnrToPC) * defbase_tot, "defnrToBone");
 				defnrToPCIndex = MEM_callocN(sizeof(*defnrToPCIndex) * defbase_tot, "defnrToIndex");
+				/* TODO(sergey): Some considerations here:
+				 *
+				 * - Make it more generic function, maybe even keep together with chanhash.
+				 * - Check whether keeping this consistent across frames gives speedup.
+				 * - Don't use hash for small armatures.
+				 */
+				GHash *idx_hash = BLI_ghash_ptr_new("pose channel index by name");
+				int pchan_index = 0;
+				for (pchan = armOb->pose->chanbase.first; pchan != NULL; pchan = pchan->next, ++pchan_index) {
+					BLI_ghash_insert(idx_hash, pchan, SET_INT_IN_POINTER(pchan_index));
+				}
 				for (i = 0, dg = target->defbase.first; dg; i++, dg = dg->next) {
 					defnrToPC[i] = BKE_pose_channel_find_name(armOb->pose, dg->name);
 					/* exclude non-deforming bones */
@@ -1045,10 +1056,11 @@ void armature_deform_verts(Object *armOb, Object *target, DerivedMesh *dm, float
 							defnrToPC[i] = NULL;
 						}
 						else {
-							defnrToPCIndex[i] = BLI_findindex(&armOb->pose->chanbase, defnrToPC[i]);
+							defnrToPCIndex[i] = GET_INT_FROM_POINTER(BLI_ghash_lookup(idx_hash, defnrToPC[i]));
 						}
 					}
 				}
+				BLI_ghash_free(idx_hash, NULL, NULL);
 			}
 		}
 	}
diff --git a/source/blender/blenkernel/intern/blender.c b/source/blender/blenkernel/intern/blender.c
index a8670395fc4..ceb641073e0 100644
--- a/source/blender/blenkernel/intern/blender.c
+++ b/source/blender/blenkernel/intern/blender.c
@@ -100,6 +100,18 @@ void BKE_blender_free(void)
 	free_nodesystem();
 }
 
+void BKE_blender_version_string(char *version_str, size_t maxncpy, short version, short subversion, bool v_prefix, bool include_subversion)
+{
+	const char *prefix = v_prefix ? "v" : "";
+
+	if (include_subversion && subversion > 0) {
+		BLI_snprintf(version_str, maxncpy, "%s%d.%02d.%d", prefix, version / 100, version % 100, subversion);
+	}
+	else {
+		BLI_snprintf(version_str, maxncpy, "%s%d.%02d", prefix, version / 100, version % 100);
+	}
+}
+
 void BKE_blender_globals_init(void)
 {
 	memset(&G, 0, sizeof(Global));
@@ -110,10 +122,7 @@ void BKE_blender_globals_init(void)
 
 	strcpy(G.ima, "//");
 
-	if (BLENDER_SUBVERSION)
-		BLI_snprintf(versionstr, sizeof(versionstr), "v%d.%02d.%d", BLENDER_VERSION / 100, BLENDER_VERSION % 100, BLENDER_SUBVERSION);
-	else
-		BLI_snprintf(versionstr, sizeof(versionstr), "v%d.%02d", BLENDER_VERSION / 100, BLENDER_VERSION % 100);
+	BKE_blender_version_string(versionstr, sizeof(versionstr), BLENDER_VERSION, BLENDER_SUBVERSION, true, true);
 
 #ifndef WITH_PYTHON_SECURITY /* default */
 	G.f |= G_SCRIPT_AUTOEXEC;
@@ -141,20 +150,18 @@ static void keymap_item_free(wmKeyMapItem *kmi)
 		MEM_freeN(kmi->ptr);
 }
 
-/**
- * When loading a new userdef from file,
- * or when exiting Blender.
- */
-void BKE_blender_userdef_free(void)
+void BKE_blender_userdef_set_data(UserDef *userdef)
 {
-	wmKeyMap *km;
-	wmKeyMapItem *kmi;
-	wmKeyMapDiffItem *kmdi;
-	bAddon *addon, *addon_next;
-	uiFont *font;
-
-	for (km = U.user_keymaps.first; km; km = km->next) {
-		for (kmdi = km->diff_items.first; kmdi; kmdi = kmdi->next) {
+	/* only here free userdef themes... */
+	BKE_blender_userdef_free_data(&U);
+	U = *userdef;
+}
+
+static void userdef_free_keymaps(UserDef *userdef)
+{
+	for (wmKeyMap *km = userdef->user_keymaps.first, *km_next; km; km = km_next) {
+		km_next = km->next;
+		for (wmKeyMapDiffItem *kmdi = km->diff_items.first; kmdi; kmdi = kmdi->next) {
 			if (kmdi->add_item) {
 				keymap_item_free(kmdi->add_item);
 				MEM_freeN(kmdi->add_item);
@@ -165,14 +172,21 @@ void BKE_blender_userdef_free(void)
 			}
 		}
 
-		for (kmi = km->items.first; kmi; kmi = kmi->next)
+		for (wmKeyMapItem *kmi = km->items.first; kmi; kmi = kmi->next) {
 			keymap_item_free(kmi);
+		}
 
 		BLI_freelistN(&km->diff_items);
 		BLI_freelistN(&km->items);
+
+		MEM_freeN(km);
 	}
-	
-	for (addon = U.addons.first; addon; addon = addon_next) {
+	BLI_listbase_clear(&userdef->user_keymaps);
+}
+
+static void userdef_free_addons(UserDef *userdef)
+{
+	for (bAddon *addon = userdef->addons.first, *addon_next; addon; addon = addon_next) {
 		addon_next = addon->next;
 		if (addon->prop) {
 			IDP_FreeProperty(addon->prop);
@@ -180,19 +194,35 @@ void BKE_blender_userdef_free(void)
 		}
 		MEM_freeN(addon);
 	}
+	BLI_listbase_clear(&userdef->addons);
+}
 
-	for (font = U.uifonts.first; font; font = font->next) {
+/**
+ * When loading a new userdef from file,
+ * or when exiting Blender.
+ */
+void BKE_blender_userdef_free_data(UserDef *userdef)
+{
+#define U _invalid_access_ /* ensure no accidental global access */
+#ifdef U  /* quiet warning */
+#endif
+
+	userdef_free_keymaps(userdef);
+	userdef_free_addons(userdef);
+
+	for (uiFont *font = userdef->uifonts.first; font; font = font->next) {
 		BLF_unload_id(font->blf_id);
 	}
 
 	BLF_default_set(-1);
 
-	BLI_freelistN(&U.autoexec_paths);
+	BLI_freelistN(&userdef->autoexec_paths);
+
+	BLI_freelistN(&userdef->uistyles);
+	BLI_freelistN(&userdef->uifonts);
+	BLI_freelistN(&userdef->themes);
 
-	BLI_freelistN(&U.uistyles);
-	BLI_freelistN(&U.uifonts);
-	BLI_freelistN(&U.themes);
-	BLI_freelistN(&U.user_keymaps);
+#undef U
 }
 
 /**
@@ -208,6 +238,44 @@ void BKE_blender_userdef_refresh(void)
 
 }
 
+/**
+ * Write U from userdef.
+ * This function defines which settings a template will override for the user preferences.
+ */
+void BKE_blender_userdef_set_app_template(UserDef *userdef)
+{
+	/* TODO:
+	 * - keymaps
+	 * - various minor settings (add as needed).
+	 */
+
+#define LIST_OVERRIDE(id) { \
+	BLI_freelistN(&U.id); \
+	BLI_movelisttolist(&U.id, &userdef->id); \
+} ((void)0)
+
+#define MEMCPY_OVERRIDE(id) \
+	memcpy(U.id, userdef->id, sizeof(U.id));
+
+	/* for some types we need custom free functions */
+	userdef_free_addons(&U);
+	userdef_free_keymaps(&U);
+
+	LIST_OVERRIDE(uistyles);
+	LIST_OVERRIDE(uifonts);
+	LIST_OVERRIDE(themes);
+	LIST_OVERRIDE(addons);
+	LIST_OVERRIDE(user_keymaps);
+
+	MEMCPY_OVERRIDE(light);
+
+	MEMCPY_OVERRIDE(font_path_ui);
+	MEMCPY_OVERRIDE(font_path_ui_mono);
+
+#undef LIST_OVERRIDE
+#undef MEMCPY_OVERRIDE
+}
+
 /* *****************  testing for break ************* */
 
 static void (*blender_test_break_cb)(void) = NULL;
@@ -234,7 +302,7 @@ int BKE_blender_test_break(void)
  * \note Don't use MEM_mallocN so functions can be registered at any time.
  * \{ */
 
-struct AtExitData {
+static struct AtExitData {
 	struct AtExitData *next;
 
 	void (*func)(void *user_data);
diff --git a/source/blender/blenkernel/intern/blender_undo.c b/source/blender/blenkernel/intern/blender_undo.c
index bc98d6f6805..e3a5edb2049 100644
--- a/source/blender/blenkernel/intern/blender_undo.c
+++ b/source/blender/blenkernel/intern/blender_undo.c
@@ -108,9 +108,9 @@ static int read_undosave(bContext *C, UndoElem *uel)
 	G.fileflags |= G_FILE_NO_UI;
 
 	if (UNDO_DISK)
-		success = (BKE_blendfile_read(C, uel->str, NULL) != BKE_BLENDFILE_READ_FAIL);
+		success = (BKE_blendfile_read(C, uel->str, NULL, 0) != BKE_BLENDFILE_READ_FAIL);
 	else
-		success = BKE_blendfile_read_from_memfile(C, &uel->memfile, NULL);
+		success = BKE_blendfile_read_from_memfile(C, &uel->memfile, NULL, 0);
 
 	/* restore */
 	BLI_strncpy(G.main->name, mainstr, sizeof(G.main->name)); /* restore */
@@ -389,7 +389,7 @@ bool BKE_undo_save_file(const char *filename)
 Main *BKE_undo_get_main(Scene **r_scene)
 {
 	Main *mainp = NULL;
-	BlendFileData *bfd = BLO_read_from_memfile(G.main, G.main->name, &curundo->memfile, NULL);
+	BlendFileData *bfd = BLO_read_from_memfile(G.main, G.main->name, &curundo->memfile, NULL, BLO_READ_SKIP_NONE);
 
 	if (bfd) {
 		mainp = bfd->main;
diff --git a/source/blender/blenkernel/intern/blendfile.c b/source/blender/blenkernel/intern/blendfile.c
index 54f709a1e5b..a521d671ea4 100644
--- a/source/blender/blenkernel/intern/blendfile.c
+++ b/source/blender/blenkernel/intern/blendfile.c
@@ -227,7 +227,7 @@ static void setup_app_data(
 	if (bfd->user) {
 
 		/* only here free userdef themes... */
-		BKE_blender_userdef_free();
+		BKE_blender_userdef_free_data(&U);
 
 		U = *bfd->user;
 
@@ -349,17 +349,23 @@ static int handle_subversion_warning(Main *main, ReportList *reports)
 	return 1;
 }
 
-int BKE_blendfile_read(bContext *C, const char *filepath, ReportList *reports)
+int BKE_blendfile_read(
+        bContext *C, const char *filepath,
+        ReportList *reports, int skip_flags)
 {
 	BlendFileData *bfd;
 	int retval = BKE_BLENDFILE_READ_OK;
 
-	if (strstr(filepath, BLENDER_STARTUP_FILE) == NULL) /* don't print user-pref loading */
-		printf("read blend: %s\n", filepath);
+	/* don't print user-pref loading */
+	if (strstr(filepath, BLENDER_STARTUP_FILE) == NULL) {
+		printf("Read blend: %s\n", filepath);
+	}
 
-	bfd = BLO_read_from_file(filepath, reports);
+	bfd = BLO_read_from_file(filepath, reports, skip_flags);
 	if (bfd) {
-		if (bfd->user) retval = BKE_BLENDFILE_READ_OK_USERPREFS;
+		if (bfd->user) {
+			retval = BKE_BLENDFILE_READ_OK_USERPREFS;
+		}
 
 		if (0 == handle_subversion_warning(bfd->main, reports)) {
 			BKE_main_free(bfd->main);
@@ -379,11 +385,11 @@ int BKE_blendfile_read(bContext *C, const char *filepath, ReportList *reports)
 
 bool BKE_blendfile_read_from_memory(
         bContext *C, const void *filebuf, int filelength,
-        ReportList *reports, bool update_defaults)
+        ReportList *reports, int skip_flags, bool update_defaults)
 {
 	BlendFileData *bfd;
 
-	bfd = BLO_read_from_memory(filebuf, filelength, reports);
+	bfd = BLO_read_from_memory(filebuf, filelength, reports, skip_flags);
 	if (bfd) {
 		if (update_defaults)
 			BLO_update_defaults_startup_blend(bfd->main);
@@ -399,11 +405,11 @@ bool BKE_blendfile_read_from_memory(
 /* memfile is the undo buffer */
 bool BKE_blendfile_read_from_memfile(
         bContext *C, struct MemFile *memfile,
-        ReportList *reports)
+        ReportList *reports, int skip_flags)
 {
 	BlendFileData *bfd;
 
-	bfd = BLO_read_from_memfile(CTX_data_main(C), G.main->name, memfile, reports);
+	bfd = BLO_read_from_memfile(CTX_data_main(C), G.main->name, memfile, reports, skip_flags);
 	if (bfd) {
 		/* remove the unused screens and wm */
 		while (bfd->main->wm.first)
@@ -420,32 +426,76 @@ bool BKE_blendfile_read_from_memfile(
 	return (bfd != NULL);
 }
 
+/**
+ * Utility to make a file 'empty' used for startup to optionally give an empty file.
+ * Handy for tests.
+ */
+void BKE_blendfile_read_make_empty(bContext *C)
+{
+	Main *bmain = CTX_data_main(C);
+
+	ListBase *lbarray[MAX_LIBARRAY];
+	ID *id;
+	int a;
+
+	a = set_listbasepointers(bmain, lbarray);
+	while (a--) {
+		id = lbarray[a]->first;
+		if (id != NULL) {
+			if (ELEM(GS(id->name), ID_SCE, ID_SCR, ID_WM)) {
+				continue;
+			}
+			while ((id = lbarray[a]->first)) {
+				BKE_libblock_delete(bmain, id);
+			}
+		}
+	}
+}
+
 /* only read the userdef from a .blend */
-int BKE_blendfile_read_userdef(const char *filepath, ReportList *reports)
+UserDef *BKE_blendfile_userdef_read(const char *filepath, ReportList *reports)
 {
 	BlendFileData *bfd;
-	int retval = BKE_BLENDFILE_READ_FAIL;
+	UserDef *userdef = NULL;
 
-	bfd = BLO_read_from_file(filepath, reports);
+	bfd = BLO_read_from_file(filepath, reports, BLO_READ_SKIP_ALL & ~BLO_READ_SKIP_USERDEF);
 	if (bfd) {
 		if (bfd->user) {
-			retval = BKE_BLENDFILE_READ_OK_USERPREFS;
+			userdef = bfd->user;
+		}
+		BKE_main_free(bfd->main);
+		MEM_freeN(bfd);
+	}
 
-			/* only here free userdef themes... */
-			BKE_blender_userdef_free();
+	return userdef;
+}
 
-			U = *bfd->user;
-			MEM_freeN(bfd->user);
+
+UserDef *BKE_blendfile_userdef_read_from_memory(
+        const void *filebuf, int filelength,
+        ReportList *reports)
+{
+	BlendFileData *bfd;
+	UserDef *userdef = NULL;
+
+	bfd = BLO_read_from_memory(filebuf, filelength, reports, BLO_READ_SKIP_ALL & ~BLO_READ_SKIP_USERDEF);
+	if (bfd) {
+		if (bfd->user) {
+			userdef = bfd->user;
 		}
 		BKE_main_free(bfd->main);
 		MEM_freeN(bfd);
 	}
+	else {
+		BKE_reports_prepend(reports, "Loading failed: ");
+	}
 
-	return retval;
+	return userdef;
 }
 
+
 /* only write the userdef in a .blend */
-int BKE_blendfile_write_userdef(const char *filepath, ReportList *reports)
+int BKE_blendfile_userdef_write(const char *filepath, ReportList *reports)
 {
 	Main *mainb = MEM_callocN(sizeof(Main), "empty main");
 	int retval = 0;
diff --git a/source/blender/blenkernel/intern/bvhutils.c b/source/blender/blenkernel/intern/bvhutils.c
index c1dd3c0a5d8..c0e4ef37a93 100644
--- a/source/blender/blenkernel/intern/bvhutils.c
+++ b/source/blender/blenkernel/intern/bvhutils.c
@@ -393,7 +393,6 @@ static BVHTree *bvhtree_from_editmesh_verts_create_tree(
         BMEditMesh *em, const int verts_num,
         const BLI_bitmap *verts_mask, int verts_num_active)
 {
-	BVHTree *tree = NULL;
 	BM_mesh_elem_table_ensure(em->bm, BM_VERT);
 	if (verts_mask) {
 		BLI_assert(IN_RANGE_INCL(verts_num_active, 0, verts_num));
@@ -402,14 +401,15 @@ static BVHTree *bvhtree_from_editmesh_verts_create_tree(
 		verts_num_active = verts_num;
 	}
 
-	tree = BLI_bvhtree_new(verts_num_active, epsilon, tree_type, axis);
+	BVHTree *tree = BLI_bvhtree_new(verts_num_active, epsilon, tree_type, axis);
 
 	if (tree) {
 		for (int i = 0; i < verts_num; i++) {
-			if (!verts_mask || BLI_BITMAP_TEST_BOOL(verts_mask, i)) {
-				BMVert *eve = BM_vert_at_index(em->bm, i);
-				BLI_bvhtree_insert(tree, i, eve->co, 1);
+			if (verts_mask && !BLI_BITMAP_TEST_BOOL(verts_mask, i)) {
+				continue;
 			}
+			BMVert *eve = BM_vert_at_index(em->bm, i);
+			BLI_bvhtree_insert(tree, i, eve->co, 1);
 		}
 		BLI_assert(BLI_bvhtree_get_size(tree) == verts_num_active);
 		BLI_bvhtree_balance(tree);
@@ -420,31 +420,28 @@ static BVHTree *bvhtree_from_editmesh_verts_create_tree(
 
 static BVHTree *bvhtree_from_mesh_verts_create_tree(
         float epsilon, int tree_type, int axis,
-        MVert *vert, const int verts_num,
+        const MVert *vert, const int verts_num,
         const BLI_bitmap *verts_mask, int verts_num_active)
 {
-	BVHTree *tree = NULL;
-	int i;
-	if (vert) {
-		if (verts_mask) {
-			BLI_assert(IN_RANGE_INCL(verts_num_active, 0, verts_num));
-		}
-		else {
-			verts_num_active = verts_num;
-		}
+	BLI_assert(vert != NULL);
+	if (verts_mask) {
+		BLI_assert(IN_RANGE_INCL(verts_num_active, 0, verts_num));
+	}
+	else {
+		verts_num_active = verts_num;
+	}
 
-		tree = BLI_bvhtree_new(verts_num_active, epsilon, tree_type, axis);
+	BVHTree *tree = BLI_bvhtree_new(verts_num_active, epsilon, tree_type, axis);
 
-		if (tree) {
-			for (i = 0; i < verts_num; i++) {
-				if (verts_mask && !BLI_BITMAP_TEST_BOOL(verts_mask, i)) {
-					continue;
-				}
-				BLI_bvhtree_insert(tree, i, vert[i].co, 1);
+	if (tree) {
+		for (int i = 0; i < verts_num; i++) {
+			if (verts_mask && !BLI_BITMAP_TEST_BOOL(verts_mask, i)) {
+				continue;
 			}
-			BLI_assert(BLI_bvhtree_get_size(tree) == verts_num_active);
-			BLI_bvhtree_balance(tree);
+			BLI_bvhtree_insert(tree, i, vert[i].co, 1);
 		}
+		BLI_assert(BLI_bvhtree_get_size(tree) == verts_num_active);
+		BLI_bvhtree_balance(tree);
 	}
 
 	return tree;
@@ -452,31 +449,23 @@ static BVHTree *bvhtree_from_mesh_verts_create_tree(
 
 static void bvhtree_from_mesh_verts_setup_data(
         BVHTreeFromMesh *data, BVHTree *tree, const bool is_cached, float epsilon,
-        MVert *vert, const bool vert_allocated)
+        const MVert *vert, const bool vert_allocated)
 {
 	memset(data, 0, sizeof(*data));
 
-	if (tree) {
-		data->tree = tree;
-		data->cached = is_cached;
+	data->tree = tree;
+	data->cached = is_cached;
 
-		/* a NULL nearest callback works fine
-		 * remember the min distance to point is the same as the min distance to BV of point */
-		data->nearest_callback = NULL;
-		data->raycast_callback = mesh_verts_spherecast;
-		data->nearest_to_ray_callback = NULL;
+	/* a NULL nearest callback works fine
+	 * remember the min distance to point is the same as the min distance to BV of point */
+	data->nearest_callback = NULL;
+	data->raycast_callback = mesh_verts_spherecast;
 
-		data->vert = vert;
-		data->vert_allocated = vert_allocated;
-		//data->face = DM_get_tessface_array(dm, &data->face_allocated);  /* XXX WHY???? */
+	data->vert = vert;
+	data->vert_allocated = vert_allocated;
+	//data->face = DM_get_tessface_array(dm, &data->face_allocated);  /* XXX WHY???? */
 
-		data->sphere_radius = epsilon;
-	}
-	else {
-		if (vert_allocated) {
-			MEM_freeN(vert);
-		}
-	}
+	data->sphere_radius = epsilon;
 }
 
 /* Builds a bvh tree where nodes are the vertices of the given em */
@@ -485,11 +474,9 @@ BVHTree *bvhtree_from_editmesh_verts_ex(
         const BLI_bitmap *verts_mask, int verts_num_active,
         float epsilon, int tree_type, int axis)
 {
-	int vert_num = em->bm->totvert;
-
 	BVHTree *tree = bvhtree_from_editmesh_verts_create_tree(
 	        epsilon, tree_type, axis,
-	        em, vert_num, verts_mask, verts_num_active);
+	        em, em->bm->totvert, verts_mask, verts_num_active);
 
 	if (tree) {
 		memset(data, 0, sizeof(*data));
@@ -497,11 +484,11 @@ BVHTree *bvhtree_from_editmesh_verts_ex(
 		data->em = em;
 		data->nearest_callback = NULL;
 		data->raycast_callback = editmesh_verts_spherecast;
-		data->nearest_to_ray_callback = NULL;
 	}
 
 	return tree;
 }
+
 BVHTree *bvhtree_from_editmesh_verts(
         BVHTreeFromEditMesh *data, BMEditMesh *em,
         float epsilon, int tree_type, int axis)
@@ -512,8 +499,8 @@ BVHTree *bvhtree_from_editmesh_verts(
 	        epsilon, tree_type, axis);
 }
 
-
-/* Builds a bvh tree where nodes are the vertices of the given dm */
+/* Builds a bvh tree where nodes are the vertices of the given dm
+ * and stores the BVHTree in dm->bvhCache */
 BVHTree *bvhtree_from_mesh_verts(
         BVHTreeFromMesh *data, DerivedMesh *dm,
         float epsilon, int tree_type, int axis)
@@ -553,10 +540,18 @@ BVHTree *bvhtree_from_mesh_verts(
 		/* printf("BVHTree is already build, using cached tree\n"); */
 	}
 
-	/* Setup BVHTreeFromMesh */
-	bvhtree_from_mesh_verts_setup_data(data, tree, true, epsilon, vert, vert_allocated);
-
-	return data->tree;
+	if (tree) {
+		/* Setup BVHTreeFromMesh */
+		bvhtree_from_mesh_verts_setup_data(
+		        data, tree, true, epsilon, vert, vert_allocated);
+	}
+	else {
+		if (vert_allocated) {
+			MEM_freeN(vert);
+		}
+		memset(data, 0, sizeof(*data));
+	}
+	return tree;
 }
 
 /**
@@ -566,7 +561,7 @@ BVHTree *bvhtree_from_mesh_verts(
  * \param verts_num_active if >= 0, number of active verts to add to BVH tree (else will be computed from mask).
  */
 BVHTree *bvhtree_from_mesh_verts_ex(
-        BVHTreeFromMesh *data, MVert *vert, const int verts_num, const bool vert_allocated,
+        BVHTreeFromMesh *data, const MVert *vert, const int verts_num, const bool vert_allocated,
         const BLI_bitmap *verts_mask, int verts_num_active,
         float epsilon, int tree_type, int axis)
 {
@@ -574,9 +569,10 @@ BVHTree *bvhtree_from_mesh_verts_ex(
 	        epsilon, tree_type, axis, vert, verts_num, verts_mask, verts_num_active);
 
 	/* Setup BVHTreeFromMesh */
-	bvhtree_from_mesh_verts_setup_data(data, tree, false, epsilon, vert, vert_allocated);
+	bvhtree_from_mesh_verts_setup_data(
+	        data, tree, false, epsilon, vert, vert_allocated);
 
-	return data->tree;
+	return tree;
 }
 
 /** \} */
@@ -592,8 +588,6 @@ static BVHTree *bvhtree_from_editmesh_edges_create_tree(
         BMEditMesh *em, const int edges_num,
         const BLI_bitmap *edges_mask, int edges_num_active)
 {
-	BVHTree *tree = NULL;
-	int i;
 	BM_mesh_elem_table_ensure(em->bm, BM_EDGE);
 	if (edges_mask) {
 		BLI_assert(IN_RANGE_INCL(edges_num_active, 0, edges_num));
@@ -602,9 +596,10 @@ static BVHTree *bvhtree_from_editmesh_edges_create_tree(
 		edges_num_active = edges_num;
 	}
 
-	tree = BLI_bvhtree_new(edges_num_active, epsilon, tree_type, axis);
+	BVHTree *tree = BLI_bvhtree_new(edges_num_active, epsilon, tree_type, axis);
 
 	if (tree) {
+		int i;
 		BMIter iter;
 		BMEdge *eed;
 		BM_ITER_MESH_INDEX (eed, &iter, em->bm, BM_EDGES_OF_MESH, i) {
@@ -624,6 +619,62 @@ static BVHTree *bvhtree_from_editmesh_edges_create_tree(
 	return tree;
 }
 
+static BVHTree *bvhtree_from_mesh_edges_create_tree(
+        const MVert *vert, const MEdge *edge, const int edge_num,
+        const BLI_bitmap *edges_mask, int edges_num_active,
+        float epsilon, int tree_type, int axis)
+{
+	if (edges_mask) {
+		BLI_assert(IN_RANGE_INCL(edges_num_active, 0, edge_num));
+	}
+	else {
+		edges_num_active = edge_num;
+	}
+	BLI_assert(vert != NULL);
+	BLI_assert(edge != NULL);
+
+	/* Create a bvh-tree of the given target */
+	BVHTree *tree = BLI_bvhtree_new(edges_num_active, epsilon, tree_type, axis);
+	if (tree) {
+		for (int i = 0; i < edge_num; i++) {
+			if (edges_mask && !BLI_BITMAP_TEST_BOOL(edges_mask, i)) {
+				continue;
+			}
+			float co[2][3];
+			copy_v3_v3(co[0], vert[edge[i].v1].co);
+			copy_v3_v3(co[1], vert[edge[i].v2].co);
+
+			BLI_bvhtree_insert(tree, i, co[0], 2);
+		}
+		BLI_bvhtree_balance(tree);
+	}
+
+	return tree;
+}
+
+static void bvhtree_from_mesh_edges_setup_data(
+        BVHTreeFromMesh *data, BVHTree *tree,
+        const bool is_cached, float epsilon,
+        const MVert *vert, const bool vert_allocated,
+        const MEdge *edge, const bool edge_allocated)
+{
+	memset(data, 0, sizeof(*data));
+
+	data->tree = tree;
+
+	data->cached = is_cached;
+
+	data->nearest_callback = mesh_edges_nearest_point;
+	data->raycast_callback = mesh_edges_spherecast;
+
+	data->vert = vert;
+	data->vert_allocated = vert_allocated;
+	data->edge = edge;
+	data->edge_allocated = edge_allocated;
+
+	data->sphere_radius = epsilon;
+}
+
 /* Builds a bvh tree where nodes are the edges of the given em */
 BVHTree *bvhtree_from_editmesh_edges_ex(
         BVHTreeFromEditMesh *data, BMEditMesh *em,
@@ -642,12 +693,11 @@ BVHTree *bvhtree_from_editmesh_edges_ex(
 		data->em = em;
 		data->nearest_callback = NULL;  /* TODO */
 		data->raycast_callback = NULL;  /* TODO */
-		/* TODO: not urgent however since users currently define own callbacks */
-		data->nearest_to_ray_callback = NULL;
 	}
 
 	return tree;
 }
+
 BVHTree *bvhtree_from_editmesh_edges(
         BVHTreeFromEditMesh *data, BMEditMesh *em,
         float epsilon, int tree_type, int axis)
@@ -680,27 +730,13 @@ BVHTree *bvhtree_from_mesh_edges(
 		BLI_rw_mutex_lock(&cache_rwlock, THREAD_LOCK_WRITE);
 		tree = bvhcache_find(dm->bvhCache, BVHTREE_FROM_EDGES);
 		if (tree == NULL) {
-			int i;
-			int numEdges = dm->getNumEdges(dm);
-
-			if (vert != NULL && edge != NULL) {
-				/* Create a bvh-tree of the given target */
-				tree = BLI_bvhtree_new(numEdges, epsilon, tree_type, axis);
-				if (tree != NULL) {
-					for (i = 0; i < numEdges; i++) {
-						float co[2][3];
-						copy_v3_v3(co[0], vert[edge[i].v1].co);
-						copy_v3_v3(co[1], vert[edge[i].v2].co);
-
-						BLI_bvhtree_insert(tree, i, co[0], 2);
-					}
-					BLI_bvhtree_balance(tree);
+			tree = bvhtree_from_mesh_edges_create_tree(
+			        vert, edge, dm->getNumEdges(dm),
+			        NULL, -1, epsilon, tree_type, axis);
 
-					/* Save on cache for later use */
-					/* printf("BVHTree built and saved on cache\n"); */
-					bvhcache_insert(&dm->bvhCache, tree, BVHTREE_FROM_EDGES);
-				}
-			}
+			/* Save on cache for later use */
+			/* printf("BVHTree built and saved on cache\n"); */
+			bvhcache_insert(&dm->bvhCache, tree, BVHTREE_FROM_EDGES);
 		}
 		BLI_rw_mutex_unlock(&cache_rwlock);
 	}
@@ -708,24 +744,10 @@ BVHTree *bvhtree_from_mesh_edges(
 		/* printf("BVHTree is already build, using cached tree\n"); */
 	}
 
-
-	/* Setup BVHTreeFromMesh */
-	memset(data, 0, sizeof(*data));
-	data->tree = tree;
-
-	if (data->tree) {
-		data->cached = true;
-
-		data->nearest_callback = mesh_edges_nearest_point;
-		data->raycast_callback = mesh_edges_spherecast;
-		data->nearest_to_ray_callback = NULL;
-
-		data->vert = vert;
-		data->vert_allocated = vert_allocated;
-		data->edge = edge;
-		data->edge_allocated = edge_allocated;
-
-		data->sphere_radius = epsilon;
+	if (tree) {
+		/* Setup BVHTreeFromMesh */
+		bvhtree_from_mesh_edges_setup_data(
+		        data, tree, true, epsilon, vert, vert_allocated, edge, edge_allocated);
 	}
 	else {
 		if (vert_allocated) {
@@ -734,8 +756,33 @@ BVHTree *bvhtree_from_mesh_edges(
 		if (edge_allocated) {
 			MEM_freeN(edge);
 		}
+		memset(data, 0, sizeof(*data));
 	}
-	return data->tree;
+	return tree;
+}
+
+/**
+ * Builds a bvh tree where nodes are the given edges.
+ * \param vert/edge_allocated if true, elem freeing will be done when freeing data.
+ * \param edges_mask if not null, true elements give which edges to add to BVH tree.
+ * \param edges_num_active if >= 0, number of active edges to add to BVH tree (else will be computed from mask).
+ */
+BVHTree *bvhtree_from_mesh_edges_ex(
+        BVHTreeFromMesh *data,
+        const MVert *vert, const bool vert_allocated,
+        const MEdge *edge, const int edges_num, const bool edge_allocated,
+        const BLI_bitmap *edges_mask, int edges_num_active,
+        float epsilon, int tree_type, int axis)
+{
+	BVHTree *tree = bvhtree_from_mesh_edges_create_tree(
+	        vert, edge, edges_num, edges_mask, edges_num_active,
+	        epsilon, tree_type, axis);
+
+	/* Setup BVHTreeFromMesh */
+	bvhtree_from_mesh_edges_setup_data(
+	        data, tree, false, epsilon, vert, vert_allocated, edge, edge_allocated);
+
+	return tree;
 }
 
 /** \} */
@@ -748,7 +795,7 @@ BVHTree *bvhtree_from_mesh_edges(
 
 static BVHTree *bvhtree_from_mesh_faces_create_tree(
         float epsilon, int tree_type, int axis,
-        MVert *vert, MFace *face, const int faces_num,
+        const MVert *vert, const MFace *face, const int faces_num,
         const BLI_bitmap *faces_mask, int faces_num_active)
 {
 	BVHTree *tree = NULL;
@@ -792,34 +839,23 @@ static BVHTree *bvhtree_from_mesh_faces_create_tree(
 
 static void bvhtree_from_mesh_faces_setup_data(
         BVHTreeFromMesh *data, BVHTree *tree, const bool is_cached, float epsilon,
-        MVert *vert, const bool vert_allocated,
-        MFace *face, const bool face_allocated)
+        const MVert *vert, const bool vert_allocated,
+        const MFace *face, const bool face_allocated)
 {
 	memset(data, 0, sizeof(*data));
 
-	if (tree) {
-		data->tree = tree;
-		data->cached = is_cached;
+	data->tree = tree;
+	data->cached = is_cached;
 
-		data->nearest_callback = mesh_faces_nearest_point;
-		data->raycast_callback = mesh_faces_spherecast;
-		data->nearest_to_ray_callback = NULL;
+	data->nearest_callback = mesh_faces_nearest_point;
+	data->raycast_callback = mesh_faces_spherecast;
 
-		data->vert = vert;
-		data->vert_allocated = vert_allocated;
-		data->face = face;
-		data->face_allocated = face_allocated;
+	data->vert = vert;
+	data->vert_allocated = vert_allocated;
+	data->face = face;
+	data->face_allocated = face_allocated;
 
-		data->sphere_radius = epsilon;
-	}
-	else {
-		if (vert_allocated) {
-			MEM_freeN(vert);
-		}
-		if (face_allocated) {
-			MEM_freeN(face);
-		}
-	}
+	data->sphere_radius = epsilon;
 }
 
 /* Builds a bvh tree where nodes are the tesselated faces of the given dm */
@@ -862,10 +898,21 @@ BVHTree *bvhtree_from_mesh_faces(
 		/* printf("BVHTree is already build, using cached tree\n"); */
 	}
 
-	/* Setup BVHTreeFromMesh */
-	bvhtree_from_mesh_faces_setup_data(data, tree, true, epsilon, vert, vert_allocated, face, face_allocated);
-
-	return data->tree;
+	if (tree) {
+		/* Setup BVHTreeFromMesh */
+		bvhtree_from_mesh_faces_setup_data(
+		        data, tree, true, epsilon, vert, vert_allocated, face, face_allocated);
+	}
+	else {
+		if (vert_allocated) {
+			MEM_freeN(vert);
+		}
+		if (face_allocated) {
+			MEM_freeN(face);
+		}
+		memset(data, 0, sizeof(*data));
+	}
+	return tree;
 }
 
 /**
@@ -876,8 +923,8 @@ BVHTree *bvhtree_from_mesh_faces(
  * \param numFaces_active if >= 0, number of active faces to add to BVH tree (else will be computed from mask).
  */
 BVHTree *bvhtree_from_mesh_faces_ex(
-        BVHTreeFromMesh *data, MVert *vert, const bool vert_allocated,
-        MFace *face, const int numFaces, const bool face_allocated,
+        BVHTreeFromMesh *data, const MVert *vert, const bool vert_allocated,
+        const MFace *face, const int numFaces, const bool face_allocated,
         const BLI_bitmap *faces_mask, int faces_num_active,
         float epsilon, int tree_type, int axis)
 {
@@ -887,9 +934,10 @@ BVHTree *bvhtree_from_mesh_faces_ex(
 	        faces_mask, faces_num_active);
 
 	/* Setup BVHTreeFromMesh */
-	bvhtree_from_mesh_faces_setup_data(data, tree, false, epsilon, vert, vert_allocated, face, face_allocated);
+	bvhtree_from_mesh_faces_setup_data(
+	        data, tree, false, epsilon, vert, vert_allocated, face, face_allocated);
 
-	return data->tree;
+	return tree;
 }
 
 /** \} */
@@ -1000,34 +1048,20 @@ static void bvhtree_from_mesh_looptri_setup_data(
 {
 	memset(data, 0, sizeof(*data));
 
-	if (tree) {
-		data->tree = tree;
-		data->cached = is_cached;
+	data->tree = tree;
+	data->cached = is_cached;
 
-		data->nearest_callback = mesh_looptri_nearest_point;
-		data->raycast_callback = mesh_looptri_spherecast;
-		data->nearest_to_ray_callback = NULL;
+	data->nearest_callback = mesh_looptri_nearest_point;
+	data->raycast_callback = mesh_looptri_spherecast;
 
-		data->vert = vert;
-		data->vert_allocated = vert_allocated;
-		data->loop = mloop;
-		data->loop_allocated = loop_allocated;
-		data->looptri = looptri;
-		data->looptri_allocated = looptri_allocated;
+	data->vert = vert;
+	data->vert_allocated = vert_allocated;
+	data->loop = mloop;
+	data->loop_allocated = loop_allocated;
+	data->looptri = looptri;
+	data->looptri_allocated = looptri_allocated;
 
-		data->sphere_radius = epsilon;
-	}
-	else {
-		if (vert_allocated) {
-			MEM_freeN((void *)vert);
-		}
-		if (loop_allocated) {
-			MEM_freeN((void *)mloop);
-		}
-		if (looptri_allocated) {
-			MEM_freeN((void *)looptri);
-		}
-	}
+	data->sphere_radius = epsilon;
 }
 
 /**
@@ -1072,7 +1106,6 @@ BVHTree *bvhtree_from_editmesh_looptri_ex(
 		data->tree = tree;
 		data->nearest_callback = editmesh_looptri_nearest_point;
 		data->raycast_callback = editmesh_looptri_spherecast;
-		data->nearest_to_ray_callback = NULL;
 		data->sphere_radius = 0.0f;
 		data->em = em;
 		data->cached = bvhCache != NULL;
@@ -1154,14 +1187,28 @@ BVHTree *bvhtree_from_mesh_looptri(
 		/* printf("BVHTree is already build, using cached tree\n"); */
 	}
 
-	/* Setup BVHTreeFromMesh */
-	bvhtree_from_mesh_looptri_setup_data(
-	        data, tree, true, epsilon,
-	        mvert, vert_allocated,
-	        mloop, loop_allocated,
-	        looptri, looptri_allocated);
+	if (tree) {
+		/* Setup BVHTreeFromMesh */
+		bvhtree_from_mesh_looptri_setup_data(
+		        data, tree, true, epsilon,
+		        mvert, vert_allocated,
+		        mloop, loop_allocated,
+		        looptri, looptri_allocated);
+	}
+	else {
+		if (vert_allocated) {
+			MEM_freeN(mvert);
+		}
+		if (loop_allocated) {
+			MEM_freeN(mloop);
+		}
+		if (looptri_allocated) {
+			MEM_freeN((void *)looptri);
+		}
+		memset(data, 0, sizeof(*data));
+	}
 
-	return data->tree;
+	return tree;
 }
 
 BVHTree *bvhtree_from_mesh_looptri_ex(
@@ -1184,7 +1231,7 @@ BVHTree *bvhtree_from_mesh_looptri_ex(
 	        mloop, loop_allocated,
 	        looptri, looptri_allocated);
 
-	return data->tree;
+	return tree;
 }
 
 /** \} */
@@ -1204,29 +1251,27 @@ void free_bvhtree_from_editmesh(struct BVHTreeFromEditMesh *data)
 /* Frees data allocated by a call to bvhtree_from_mesh_*. */
 void free_bvhtree_from_mesh(struct BVHTreeFromMesh *data)
 {
-	if (data->tree) {
-		if (!data->cached) {
-			BLI_bvhtree_free(data->tree);
-		}
-
-		if (data->vert_allocated) {
-			MEM_freeN((void *)data->vert);
-		}
-		if (data->edge_allocated) {
-			MEM_freeN((void *)data->edge);
-		}
-		if (data->face_allocated) {
-			MEM_freeN((void *)data->face);
-		}
-		if (data->loop_allocated) {
-			MEM_freeN((void *)data->loop);
-		}
-		if (data->looptri_allocated) {
-			MEM_freeN((void *)data->looptri);
-		}
+	if (data->tree && !data->cached) {
+		BLI_bvhtree_free(data->tree);
+	}
 
-		memset(data, 0, sizeof(*data));
+	if (data->vert_allocated) {
+		MEM_freeN((void *)data->vert);
+	}
+	if (data->edge_allocated) {
+		MEM_freeN((void *)data->edge);
 	}
+	if (data->face_allocated) {
+		MEM_freeN((void *)data->face);
+	}
+	if (data->loop_allocated) {
+		MEM_freeN((void *)data->loop);
+	}
+	if (data->looptri_allocated) {
+		MEM_freeN((void *)data->looptri);
+	}
+
+	memset(data, 0, sizeof(*data));
 }
 
 
diff --git a/source/blender/blenkernel/intern/cdderivedmesh.c b/source/blender/blenkernel/intern/cdderivedmesh.c
index fc3e358cb25..7042b46330b 100644
--- a/source/blender/blenkernel/intern/cdderivedmesh.c
+++ b/source/blender/blenkernel/intern/cdderivedmesh.c
@@ -2398,13 +2398,16 @@ DerivedMesh *CDDM_from_editbmesh(BMEditMesh *em, const bool use_mdisps, const bo
 	        use_tessface, em->tottri, (const BMLoop *(*)[3])em->looptris);
 }
 
-static DerivedMesh *cddm_copy_ex(DerivedMesh *source, int faces_from_tessfaces)
+static DerivedMesh *cddm_copy_ex(DerivedMesh *source,
+                                 const bool need_tessface_data,
+                                 const bool faces_from_tessfaces)
 {
+	const bool copy_tessface_data = (faces_from_tessfaces || need_tessface_data);
 	CDDerivedMesh *cddm = cdDM_create("CDDM_copy cddm");
 	DerivedMesh *dm = &cddm->dm;
 	int numVerts = source->numVertData;
 	int numEdges = source->numEdgeData;
-	int numTessFaces = source->numTessFaceData;
+	int numTessFaces = copy_tessface_data ? source->numTessFaceData : 0;
 	int numLoops = source->numLoopData;
 	int numPolys = source->numPolyData;
 
@@ -2414,20 +2417,29 @@ static DerivedMesh *cddm_copy_ex(DerivedMesh *source, int faces_from_tessfaces)
 	source->getVertDataArray(source, CD_ORIGINDEX);
 	source->getEdgeDataArray(source, CD_ORIGINDEX);
 	source->getPolyDataArray(source, CD_ORIGINDEX);
+	if (copy_tessface_data) {
+		source->getTessFaceDataArray(source, CD_ORIGINDEX);
+	}
 
 	/* this initializes dm, and copies all non mvert/medge/mface layers */
-	DM_from_template(dm, source, DM_TYPE_CDDM, numVerts, numEdges,
-	                 faces_from_tessfaces ? numTessFaces : 0,
+	DM_from_template(dm, source, DM_TYPE_CDDM, numVerts, numEdges, numTessFaces,
 	                 numLoops, numPolys);
 	dm->deformedOnly = source->deformedOnly;
 	dm->cd_flag = source->cd_flag;
 	dm->dirty = source->dirty;
 
-	/* Tessellation data is never copied, so tag it here. */
-	dm->dirty |= DM_DIRTY_TESS_CDLAYERS;
+	/* Tessellation data is only copied when copy_tessface_data is set.
+	 * Only tag dirty layers if we really ignored tessellation faces.
+	 */
+	if (!copy_tessface_data) {
+		dm->dirty |= DM_DIRTY_TESS_CDLAYERS;
+	}
 
 	CustomData_copy_data(&source->vertData, &dm->vertData, 0, 0, numVerts);
 	CustomData_copy_data(&source->edgeData, &dm->edgeData, 0, 0, numEdges);
+	if (copy_tessface_data) {
+		CustomData_copy_data(&source->faceData, &dm->faceData, 0, 0, numTessFaces);
+	}
 
 	/* now add mvert/medge/mface layers */
 	cddm->mvert = source->dupVertArray(source);
@@ -2435,17 +2447,16 @@ static DerivedMesh *cddm_copy_ex(DerivedMesh *source, int faces_from_tessfaces)
 
 	CustomData_add_layer(&dm->vertData, CD_MVERT, CD_ASSIGN, cddm->mvert, numVerts);
 	CustomData_add_layer(&dm->edgeData, CD_MEDGE, CD_ASSIGN, cddm->medge, numEdges);
-	
-	if (!faces_from_tessfaces) {
-		DM_DupPolys(source, dm);
-	}
-	else {
-		source->getTessFaceDataArray(source, CD_ORIGINDEX);
-		CustomData_copy_data(&source->faceData, &dm->faceData, 0, 0, numTessFaces);
 
+	if (faces_from_tessfaces || copy_tessface_data) {
 		cddm->mface = source->dupTessFaceArray(source);
 		CustomData_add_layer(&dm->faceData, CD_MFACE, CD_ASSIGN, cddm->mface, numTessFaces);
+	}
 
+	if (!faces_from_tessfaces) {
+		DM_DupPolys(source, dm);
+	}
+	else {
 		CDDM_tessfaces_to_faces(dm);
 	}
 
@@ -2457,12 +2468,17 @@ static DerivedMesh *cddm_copy_ex(DerivedMesh *source, int faces_from_tessfaces)
 
 DerivedMesh *CDDM_copy(DerivedMesh *source)
 {
-	return cddm_copy_ex(source, 0);
+	return cddm_copy_ex(source, false, false);
 }
 
 DerivedMesh *CDDM_copy_from_tessface(DerivedMesh *source)
 {
-	return cddm_copy_ex(source, 1);
+	return cddm_copy_ex(source, false, true);
+}
+
+DerivedMesh *CDDM_copy_with_tessface(DerivedMesh *source)
+{
+	return cddm_copy_ex(source, true, false);
 }
 
 /* note, the CD_ORIGINDEX layers are all 0, so if there is a direct
diff --git a/source/blender/blenkernel/intern/cloth.c b/source/blender/blenkernel/intern/cloth.c
index 28ef3f6f248..ee0fde1ea61 100644
--- a/source/blender/blenkernel/intern/cloth.c
+++ b/source/blender/blenkernel/intern/cloth.c
@@ -869,12 +869,6 @@ static int cloth_from_object(Object *ob, ClothModifierData *clmd, DerivedMesh *d
 		return 0;
 	}
 	
-	for ( i = 0; i < dm->getNumVerts(dm); i++) {
-		if ((!(cloth->verts[i].flags & CLOTH_VERT_FLAG_PINNED)) && (cloth->verts[i].goal > ALMOST_ZERO)) {
-			cloth_add_spring (clmd, i, i, 0.0, CLOTH_SPRING_TYPE_GOAL);
-		}
-	}
-	
 	// init our solver
 	BPH_cloth_solver_init(ob, clmd);
 	
@@ -944,37 +938,6 @@ BLI_INLINE void spring_verts_ordered_set(ClothSpring *spring, int v0, int v1)
 	}
 }
 
-// be careful: implicit solver has to be resettet when using this one!
-// --> only for implicit handling of this spring!
-int cloth_add_spring(ClothModifierData *clmd, unsigned int indexA, unsigned int indexB, float restlength, int spring_type)
-{
-	Cloth *cloth = clmd->clothObject;
-	ClothSpring *spring = NULL;
-	
-	if (cloth) {
-		// TODO: look if this spring is already there
-		
-		spring = (ClothSpring *)MEM_callocN ( sizeof ( ClothSpring ), "cloth spring" );
-		
-		if (!spring)
-			return 0;
-		
-		spring->ij = indexA;
-		spring->kl = indexB;
-		spring->restlen =  restlength;
-		spring->type = spring_type;
-		spring->flags = 0;
-		spring->stiffness = 0;
-		
-		cloth->numsprings++;
-	
-		BLI_linklist_prepend ( &cloth->springs, spring );
-		
-		return 1;
-	}
-	return 0;
-}
-
 static void cloth_free_edgelist(LinkNodePair *edgelist, unsigned int mvert_num)
 {
 	if (edgelist) {
diff --git a/source/blender/blenkernel/intern/colortools.c b/source/blender/blenkernel/intern/colortools.c
index 4f3ffed41bc..b3a0895d063 100644
--- a/source/blender/blenkernel/intern/colortools.c
+++ b/source/blender/blenkernel/intern/colortools.c
@@ -508,7 +508,7 @@ static void calchandle_curvemap(
 				if ((bezt->h2 == HD_AUTO_ANIM) && next && prev) { /* keep horizontal if extrema */
 					const float ydiff1 = prev->vec[1][1] - bezt->vec[1][1];
 					const float ydiff2 = next->vec[1][1] - bezt->vec[1][1];
-					if ((ydiff1 <= 0.0f && ydiff2 <= 0.0f)||
+					if ((ydiff1 <= 0.0f && ydiff2 <= 0.0f) ||
 					    (ydiff1 >= 0.0f && ydiff2 >= 0.0f))
 					{
 						bezt->vec[2][1] = bezt->vec[1][1];
diff --git a/source/blender/blenkernel/intern/curve.c b/source/blender/blenkernel/intern/curve.c
index 90a514781d7..439abb1d593 100644
--- a/source/blender/blenkernel/intern/curve.c
+++ b/source/blender/blenkernel/intern/curve.c
@@ -89,20 +89,33 @@ void BKE_curve_editfont_free(Curve *cu)
 	}
 }
 
-void BKE_curve_editNurb_keyIndex_free(EditNurb *editnurb)
+static void curve_editNurb_keyIndex_cv_free_cb(void *val)
 {
-	if (!editnurb->keyindex) {
+	CVKeyIndex *index = val;
+	MEM_freeN(index->orig_cv);
+	MEM_freeN(val);
+}
+
+void BKE_curve_editNurb_keyIndex_delCV(GHash *keyindex, const void *cv)
+{
+	BLI_assert(keyindex != NULL);
+	BLI_ghash_remove(keyindex, cv, NULL, curve_editNurb_keyIndex_cv_free_cb);
+}
+
+void BKE_curve_editNurb_keyIndex_free(GHash **keyindex)
+{
+	if (!(*keyindex)) {
 		return;
 	}
-	BLI_ghash_free(editnurb->keyindex, NULL, MEM_freeN);
-	editnurb->keyindex = NULL;
+	BLI_ghash_free(*keyindex, NULL, curve_editNurb_keyIndex_cv_free_cb);
+	*keyindex = NULL;
 }
 
 void BKE_curve_editNurb_free(Curve *cu)
 {
 	if (cu->editnurb) {
 		BKE_nurbList_free(&cu->editnurb->nurbs);
-		BKE_curve_editNurb_keyIndex_free(cu->editnurb);
+		BKE_curve_editNurb_keyIndex_free(&cu->editnurb->keyindex);
 		MEM_freeN(cu->editnurb);
 		cu->editnurb = NULL;
 	}
diff --git a/source/blender/blenkernel/intern/customdata.c b/source/blender/blenkernel/intern/customdata.c
index 98d37fb07bf..7c3f0ac630d 100644
--- a/source/blender/blenkernel/intern/customdata.c
+++ b/source/blender/blenkernel/intern/customdata.c
@@ -33,14 +33,10 @@
 /** \file blender/blenkernel/intern/customdata.c
  *  \ingroup bke
  */
- 
-
-#include <math.h>
-#include <string.h>
-#include <assert.h>
 
 #include "MEM_guardedalloc.h"
 
+#include "DNA_customdata_types.h"
 #include "DNA_meshdata_types.h"
 #include "DNA_ID.h"
 
@@ -62,20 +58,16 @@
 #include "BKE_mesh_remap.h"
 #include "BKE_multires.h"
 
-#include "data_transfer_intern.h"
-
 #include "bmesh.h"
 
-#include <math.h>
-#include <string.h>
+/* only for customdata_data_transfer_interp_normal_normals */
+#include "data_transfer_intern.h"
 
 /* number of layers to add when growing a CustomData object */
 #define CUSTOMDATA_GROW 5
 
 /* ensure typemap size is ok */
-BLI_STATIC_ASSERT(sizeof(((CustomData *)NULL)->typemap) /
-                  sizeof(((CustomData *)NULL)->typemap[0]) == CD_NUMTYPES,
-                  "size mismatch");
+BLI_STATIC_ASSERT(ARRAY_SIZE(((CustomData *)NULL)->typemap) == CD_NUMTYPES, "size mismatch");
 
 
 /********************* Layer type information **********************/
@@ -805,18 +797,15 @@ static void layerInterp_mloopcol(
         const float *sub_weights, int count, void *dest)
 {
 	MLoopCol *mc = dest;
-	int i;
-	const float *sub_weight;
 	struct {
 		float a;
 		float r;
 		float g;
 		float b;
-	} col;
-	col.a = col.r = col.g = col.b = 0;
+	} col = {0};
 
-	sub_weight = sub_weights;
-	for (i = 0; i < count; ++i) {
+	const float *sub_weight = sub_weights;
+	for (int i = 0; i < count; ++i) {
 		float weight = weights ? weights[i] : 1;
 		const MLoopCol *src = sources[i];
 		if (sub_weights) {
@@ -833,19 +822,16 @@ static void layerInterp_mloopcol(
 			col.a += src->a * weight;
 		}
 	}
-	
+
+
 	/* Subdivide smooth or fractal can cause problems without clamping
 	 * although weights should also not cause this situation */
-	CLAMP(col.a, 0.0f, 255.0f);
-	CLAMP(col.r, 0.0f, 255.0f);
-	CLAMP(col.g, 0.0f, 255.0f);
-	CLAMP(col.b, 0.0f, 255.0f);
 
-	/* delay writing to the destination incase dest is in sources */
-	mc->r = (int)col.r;
-	mc->g = (int)col.g;
-	mc->b = (int)col.b;
-	mc->a = (int)col.a;
+	/* also delay writing to the destination incase dest is in sources */
+	mc->r = CLAMPIS(iroundf(col.r), 0, 255);
+	mc->g = CLAMPIS(iroundf(col.g), 0, 255);
+	mc->b = CLAMPIS(iroundf(col.b), 0, 255);
+	mc->a = CLAMPIS(iroundf(col.a), 0, 255);
 }
 
 static int layerMaxNum_mloopcol(void)
@@ -1068,15 +1054,10 @@ static void layerInterp_mcol(
 		
 		/* Subdivide smooth or fractal can cause problems without clamping
 		 * although weights should also not cause this situation */
-		CLAMP(col[j].a, 0.0f, 255.0f);
-		CLAMP(col[j].r, 0.0f, 255.0f);
-		CLAMP(col[j].g, 0.0f, 255.0f);
-		CLAMP(col[j].b, 0.0f, 255.0f);
-		
-		mc[j].a = (int)col[j].a;
-		mc[j].r = (int)col[j].r;
-		mc[j].g = (int)col[j].g;
-		mc[j].b = (int)col[j].b;
+		mc[j].a = CLAMPIS(iroundf(col[j].a), 0, 255);
+		mc[j].r = CLAMPIS(iroundf(col[j].r), 0, 255);
+		mc[j].g = CLAMPIS(iroundf(col[j].g), 0, 255);
+		mc[j].b = CLAMPIS(iroundf(col[j].b), 0, 255);
 	}
 }
 
@@ -2595,7 +2576,7 @@ bool CustomData_from_bmeshpoly_test(CustomData *fdata, CustomData *pdata, Custom
 	if (!LAYER_CMP(ldata, CD_TANGENT, fdata, CD_TANGENT))
 		return false;
 
-#undef TEST_RET
+#undef LAYER_CMP
 
 	/* if no layers are on either CustomData's,
 	 * then there was nothing to do... */
diff --git a/source/blender/blenkernel/intern/depsgraph.c b/source/blender/blenkernel/intern/depsgraph.c
index 294a4ce76b7..678dc92a5f2 100644
--- a/source/blender/blenkernel/intern/depsgraph.c
+++ b/source/blender/blenkernel/intern/depsgraph.c
@@ -544,10 +544,16 @@ static void build_dag_object(DagForest *dag, DagNode *scenenode, Main *bmain, Sc
 									if (ct->tar->type == OB_MESH)
 										node3->customdata_mask |= CD_MASK_MDEFORMVERT;
 								}
-								else if (ELEM(con->type, CONSTRAINT_TYPE_FOLLOWPATH, CONSTRAINT_TYPE_CLAMPTO, CONSTRAINT_TYPE_SPLINEIK))
+								else if (ELEM(con->type, CONSTRAINT_TYPE_FOLLOWPATH,
+								                         CONSTRAINT_TYPE_CLAMPTO,
+								                         CONSTRAINT_TYPE_SPLINEIK,
+								                         CONSTRAINT_TYPE_SHRINKWRAP))
+								{
 									dag_add_relation(dag, node3, node, DAG_RL_DATA_DATA | DAG_RL_OB_DATA, cti->name);
-								else
+								}
+								else {
 									dag_add_relation(dag, node3, node, DAG_RL_OB_DATA, cti->name);
+								}
 							}
 						}
 						
@@ -881,8 +887,12 @@ static void build_dag_object(DagForest *dag, DagNode *scenenode, Main *bmain, Sc
 						if (obt->type == OB_MESH)
 							node2->customdata_mask |= CD_MASK_MDEFORMVERT;
 					}
-					else
+					else if (cti->type == CONSTRAINT_TYPE_SHRINKWRAP) {
+						dag_add_relation(dag, node2, node, DAG_RL_DATA_DATA | DAG_RL_OB_DATA, cti->name);
+					}
+					else {
 						dag_add_relation(dag, node2, node, DAG_RL_OB_OB, cti->name);
+					}
 				}
 				addtoroot = 0;
 			}
diff --git a/source/blender/blenkernel/intern/displist.c b/source/blender/blenkernel/intern/displist.c
index 49db75a0474..f8a9d57f579 100644
--- a/source/blender/blenkernel/intern/displist.c
+++ b/source/blender/blenkernel/intern/displist.c
@@ -819,7 +819,7 @@ static void curve_calc_modifiers_pre(Scene *scene, Object *ob, ListBase *nurb,
 	if (editmode)
 		required_mode |= eModifierMode_Editmode;
 
-	if (cu->editnurb == NULL) {
+	if (!editmode) {
 		keyVerts = BKE_key_evaluate_object(ob, &numVerts);
 
 		if (keyVerts) {
diff --git a/source/blender/blenkernel/intern/dynamicpaint.c b/source/blender/blenkernel/intern/dynamicpaint.c
index dc9f3b57f1f..c525e75b825 100644
--- a/source/blender/blenkernel/intern/dynamicpaint.c
+++ b/source/blender/blenkernel/intern/dynamicpaint.c
@@ -2460,8 +2460,7 @@ static void dynamic_paint_find_island_border(
 			const unsigned int *other_loop_idx = mlooptri[lt_index].tri;
 
 			/* Check edges for match, looping in the same order as the outer loop. */
-			for (int j = 0; j < 3; j++)
-			{
+			for (int j = 0; j < 3; j++) {
 				const int overt0 = mloop[other_loop_idx[(j + 0)]].v;
 				const int overt1 = mloop[other_loop_idx[(j + 1) % 3]].v;
 
@@ -2525,8 +2524,7 @@ static void dynamic_paint_find_island_border(
 		int final_pixel[2] = { (int)floorf(tgt_pixel[0] * w), (int)floorf(tgt_pixel[1] * h) };
 
 		/* If current pixel uv is outside of texture	*/
-		if (final_pixel[0] < 0 || final_pixel[0] >= w || final_pixel[1] < 0 || final_pixel[1] >= h)
-		{
+		if (final_pixel[0] < 0 || final_pixel[0] >= w || final_pixel[1] < 0 || final_pixel[1] >= h) {
 			if (bdata->best_index == NOT_FOUND)
 				bdata->best_index = OUT_OF_TEXTURE;
 
diff --git a/source/blender/blenkernel/intern/editderivedmesh.c b/source/blender/blenkernel/intern/editderivedmesh.c
index e7c0e69b1cb..2fffa0dea28 100644
--- a/source/blender/blenkernel/intern/editderivedmesh.c
+++ b/source/blender/blenkernel/intern/editderivedmesh.c
@@ -642,10 +642,10 @@ static void emDM_recalcLoopTri(DerivedMesh *dm)
 		MLoopTri *lt = &mlooptri[i];
 
 		ARRAY_SET_ITEMS(
-				lt->tri,
-				BM_elem_index_get(ltri[0]),
-				BM_elem_index_get(ltri[1]),
-				BM_elem_index_get(ltri[2]));
+		        lt->tri,
+		        BM_elem_index_get(ltri[0]),
+		        BM_elem_index_get(ltri[1]),
+		        BM_elem_index_get(ltri[2]));
 		lt->poly = BM_elem_index_get(ltri[0]->f);
 	}
 }
diff --git a/source/blender/blenkernel/intern/effect.c b/source/blender/blenkernel/intern/effect.c
index fe8f5ebdca6..4eee24b378f 100644
--- a/source/blender/blenkernel/intern/effect.c
+++ b/source/blender/blenkernel/intern/effect.c
@@ -848,6 +848,14 @@ static void do_physical_effector(EffectorCache *eff, EffectorData *efd, Effected
 			break;
 		case PFIELD_FORCE:
 			normalize_v3(force);
+			if (pd->flag & PFIELD_GRAVITATION){ /* Option: Multiply by 1/distance^2 */
+				if (efd->distance < FLT_EPSILON){
+					strength = 0.0f;
+				}
+				else {
+					strength *= powf(efd->distance, -2.0f);
+				}
+			}
 			mul_v3_fl(force, strength * efd->falloff);
 			break;
 		case PFIELD_VORTEX:
diff --git a/source/blender/blenkernel/intern/fcurve.c b/source/blender/blenkernel/intern/fcurve.c
index c67a61a5aad..7dbc43e0a32 100644
--- a/source/blender/blenkernel/intern/fcurve.c
+++ b/source/blender/blenkernel/intern/fcurve.c
@@ -1217,6 +1217,7 @@ bool driver_get_variable_property(
 	return true;
 }
 
+#if 0
 /* Helper function to obtain a pointer to a Pose Channel (for evaluating drivers) */
 static bPoseChannel *dtar_get_pchan_ptr(ChannelDriver *driver, DriverTarget *dtar)
 {
@@ -1239,6 +1240,32 @@ static bPoseChannel *dtar_get_pchan_ptr(ChannelDriver *driver, DriverTarget *dta
 		return NULL;
 	}
 }
+#endif
+
+static short driver_check_valid_targets(ChannelDriver *driver, DriverVar *dvar)
+{
+	short valid_targets = 0;
+
+	DRIVER_TARGETS_USED_LOOPER(dvar)
+	{
+		Object *ob = (Object *)dtar_id_ensure_proxy_from(dtar->id);
+
+		/* check if this target has valid data */
+		if ((ob == NULL) || (GS(ob->id.name) != ID_OB)) {
+			/* invalid target, so will not have enough targets */
+			driver->flag |= DRIVER_FLAG_INVALID;
+			dtar->flag |= DTAR_FLAG_INVALID;
+		}
+		else {
+			/* target seems to be OK now... */
+			dtar->flag &= ~DTAR_FLAG_INVALID;
+			valid_targets++;
+		}
+	}
+	DRIVER_TARGETS_LOOPER_END
+
+	return valid_targets;
+}
 
 /* ......... */
 
@@ -1252,62 +1279,54 @@ static float dvar_eval_singleProp(ChannelDriver *driver, DriverVar *dvar)
 /* evaluate 'rotation difference' driver variable */
 static float dvar_eval_rotDiff(ChannelDriver *driver, DriverVar *dvar)
 {
-	DriverTarget *dtar1 = &dvar->targets[0];
-	DriverTarget *dtar2 = &dvar->targets[1];
-	bPoseChannel *pchan, *pchan2;
-	float q1[4], q2[4], quat[4], angle;
-	
-	/* get pose channels, and check if we've got two */
-	pchan  = dtar_get_pchan_ptr(driver, dtar1);
-	pchan2 = dtar_get_pchan_ptr(driver, dtar2);
-	
-	if (ELEM(NULL, pchan, pchan2)) {
-		/* disable this driver, since it doesn't work correctly... */
-		driver->flag |= DRIVER_FLAG_INVALID;
-		
-		/* check what the error was */
-		if ((pchan == NULL) && (pchan2 == NULL)) {
-			if (G.debug & G_DEBUG) {
-				printf("Driver Evaluation Error: Rotational difference failed - first 2 targets invalid\n");
-			}
-			
-			dtar1->flag |= DTAR_FLAG_INVALID;
-			dtar2->flag |= DTAR_FLAG_INVALID;
-		}
-		else if (pchan == NULL) {
-			if (G.debug & G_DEBUG) {
-				printf("Driver Evaluation Error: Rotational difference failed - first target not valid PoseChannel\n");
-			}
-			
-			dtar1->flag |=  DTAR_FLAG_INVALID;
-			dtar2->flag &= ~DTAR_FLAG_INVALID;
-		}
-		else if (pchan2 == NULL) {
-			if (G.debug & G_DEBUG) {
-				printf("Driver Evaluation Error: Rotational difference failed - second target not valid PoseChannel\n");
-			}
-			
-			dtar1->flag &= ~DTAR_FLAG_INVALID;
-			dtar2->flag |=  DTAR_FLAG_INVALID;
+	short valid_targets = driver_check_valid_targets(driver, dvar);
+
+	/* make sure we have enough valid targets to use - all or nothing for now... */
+	if (valid_targets != 2) {
+		if (G.debug & G_DEBUG) {
+			printf("RotDiff DVar: not enough valid targets (n = %d) (a = %p, b = %p)\n",
+			        valid_targets, dvar->targets[0].id, dvar->targets[1].id);
 		}
-		
-		/* stop here... */
 		return 0.0f;
 	}
-	else {
-		dtar1->flag &= ~DTAR_FLAG_INVALID;
-		dtar2->flag &= ~DTAR_FLAG_INVALID;
+
+	float (*mat[2])[4];
+
+	/* NOTE: for now, these are all just worldspace */
+	for (int i = 0; i < 2; i++) {
+		/* resolve the object for this target, then pick the matrix to read the rotation from */
+		DriverTarget *dtar = &dvar->targets[i];
+		Object *ob = (Object *)dtar_id_ensure_proxy_from(dtar->id);
+		bPoseChannel *pchan;
+
+		/* after the checks above, the targets should be valid here... */
+		BLI_assert((ob != NULL) && (GS(ob->id.name) == ID_OB));
+
+		/* try to get posechannel */
+		pchan = BKE_pose_channel_find_name(ob->pose, dtar->pchan_name);
+
+		/* check if object or bone */
+		if (pchan) {
+			/* bone */
+			mat[i] = pchan->pose_mat;
+		}
+		else {
+			/* object */
+			mat[i] = ob->obmat;
+		}
 	}
-	
+
+	float q1[4], q2[4], quat[4], angle;
+
 	/* use the final posed locations */
-	mat4_to_quat(q1, pchan->pose_mat);
-	mat4_to_quat(q2, pchan2->pose_mat);
-	
+	mat4_to_quat(q1, mat[0]);
+	mat4_to_quat(q2, mat[1]);
+
 	invert_qt_normalized(q1);
 	mul_qt_qtqt(quat, q1, q2);
 	angle = 2.0f * (saacos(quat[0]));
 	angle = ABS(angle);
-	
+
 	return (angle > (float)M_PI) ? (float)((2.0f * (float)M_PI) - angle) : (float)(angle);
 }
 
@@ -1317,32 +1336,8 @@ static float dvar_eval_locDiff(ChannelDriver *driver, DriverVar *dvar)
 {
 	float loc1[3] = {0.0f, 0.0f, 0.0f};
 	float loc2[3] = {0.0f, 0.0f, 0.0f};
-	short valid_targets = 0;
-	
-	/* Perform two passes
-	 *
-	 * FIRST PASS - to just check that everything works... 
-	 * NOTE: we use loops here to reduce code duplication, though in practice, 
-	 *       there can only be 2 items or else we run into some problems later
-	 */
-	DRIVER_TARGETS_USED_LOOPER(dvar)
-	{
-		Object *ob = (Object *)dtar_id_ensure_proxy_from(dtar->id);
-		
-		/* check if this target has valid data */
-		if ((ob == NULL) || (GS(ob->id.name) != ID_OB)) {
-			/* invalid target, so will not have enough targets */
-			driver->flag |= DRIVER_FLAG_INVALID;
-			dtar->flag   |= DTAR_FLAG_INVALID;
-		}
-		else {
-			/* target seems to be OK now... */
-			dtar->flag &= ~DTAR_FLAG_INVALID;
-			valid_targets++;
-		}
-	}
-	DRIVER_TARGETS_LOOPER_END
-	
+	short valid_targets = driver_check_valid_targets(driver, dvar);
+
 	/* make sure we have enough valid targets to use - all or nothing for now... */
 	if (valid_targets < dvar->num_targets) {
 		if (G.debug & G_DEBUG) {
@@ -1351,8 +1346,7 @@ static float dvar_eval_locDiff(ChannelDriver *driver, DriverVar *dvar)
 		}
 		return 0.0f;
 	}
-	
-	
+
 	/* SECOND PASS: get two location values */
 	/* NOTE: for now, these are all just worldspace */
 	DRIVER_TARGETS_USED_LOOPER(dvar)
@@ -1570,7 +1564,7 @@ static DriverVarTypeInfo dvar_types[MAX_DVAR_TYPES] = {
 	BEGIN_DVAR_TYPEDEF(DVAR_TYPE_ROT_DIFF)
 		dvar_eval_rotDiff,     /* eval callback */
 		2,     /* number of targets used */
-		{"Bone 1", "Bone 2"},     /* UI names for targets */
+		{"Object/Bone 1", "Object/Bone 2"},     /* UI names for targets */
 		{DTAR_FLAG_STRUCT_REF | DTAR_FLAG_ID_OB_ONLY, DTAR_FLAG_STRUCT_REF | DTAR_FLAG_ID_OB_ONLY} /* flags */
 	END_DVAR_TYPEDEF,
 	
diff --git a/source/blender/blenkernel/intern/idprop.c b/source/blender/blenkernel/intern/idprop.c
index b2641b110f8..a596869e5e8 100644
--- a/source/blender/blenkernel/intern/idprop.c
+++ b/source/blender/blenkernel/intern/idprop.c
@@ -130,18 +130,22 @@ void IDP_FreeIDPArray(IDProperty *prop)
 		MEM_freeN(prop->data.pointer);
 }
 
-/*shallow copies item*/
+/* shallow copies item */
 void IDP_SetIndexArray(IDProperty *prop, int index, IDProperty *item)
 {
 	IDProperty *old;
 
 	BLI_assert(prop->type == IDP_IDPARRAY);
 
+	if (index >= prop->len || index < 0)
+		return;
+
 	old = GETPROP(prop, index);
-	if (index >= prop->len || index < 0) return;
-	if (item != old) IDP_FreeProperty(old);
-	
-	memcpy(GETPROP(prop, index), item, sizeof(IDProperty));
+	if (item != old) {
+		IDP_FreeProperty(old);
+
+		memcpy(old, item, sizeof(IDProperty));
+	}
 }
 
 IDProperty *IDP_GetIndexArray(IDProperty *prop, int index)
diff --git a/source/blender/blenkernel/intern/image.c b/source/blender/blenkernel/intern/image.c
index a2d94ccc478..318f6480aaf 100644
--- a/source/blender/blenkernel/intern/image.c
+++ b/source/blender/blenkernel/intern/image.c
@@ -3159,7 +3159,7 @@ static ImBuf *load_sequence_single(Image *ima, ImageUser *iuser, int frame, cons
 	struct ImBuf *ibuf;
 	char name[FILE_MAX];
 	int flag;
-	ImageUser iuser_t;
+	ImageUser iuser_t = {0};
 
 	/* XXX temp stuff? */
 	if (ima->lastframe != frame)
@@ -3167,8 +3167,12 @@ static ImBuf *load_sequence_single(Image *ima, ImageUser *iuser, int frame, cons
 
 	ima->lastframe = frame;
 
-	if (iuser)
+	if (iuser) {
 		iuser_t = *iuser;
+	}
+	else {
+		/* TODO(sergey): Do we need to initialize something here? */
+	}
 
 	iuser_t.view = view_id;
 	BKE_image_user_file_path(&iuser_t, ima, name);
diff --git a/source/blender/blenkernel/intern/lattice.c b/source/blender/blenkernel/intern/lattice.c
index b0671f33094..54cfae620a1 100644
--- a/source/blender/blenkernel/intern/lattice.c
+++ b/source/blender/blenkernel/intern/lattice.c
@@ -67,7 +67,7 @@
 
 #include "BKE_deform.h"
 
-/* Workaround for cyclic depenndnecy with curves.
+/* Workaround for cyclic dependency with curves.
  * In such case curve_cache might not be ready yet,
  */
 #define CYCLIC_DEPENDENCY_WORKAROUND
diff --git a/source/blender/blenkernel/intern/library.c b/source/blender/blenkernel/intern/library.c
index 6b4b492b212..e380b5c17e0 100644
--- a/source/blender/blenkernel/intern/library.c
+++ b/source/blender/blenkernel/intern/library.c
@@ -1863,7 +1863,7 @@ void BKE_library_make_local(
 				/* Special case for objects because we don't want proxy pointers to be
 				 * cleared yet. This will happen down the road in this function.
 				 */
-				BKE_object_make_local_ex(bmain, (Object*)id, true, false);
+				BKE_object_make_local_ex(bmain, (Object *)id, true, false);
 			}
 			else {
 				id_make_local(bmain, id, false, true);
diff --git a/source/blender/blenkernel/intern/library_remap.c b/source/blender/blenkernel/intern/library_remap.c
index a408b498f18..b6f4621a0b3 100644
--- a/source/blender/blenkernel/intern/library_remap.c
+++ b/source/blender/blenkernel/intern/library_remap.c
@@ -179,6 +179,7 @@ static int foreach_libblock_remap_callback(void *user_data, ID *id_self, ID **id
 		 *       on the other hand since they get reset to lib data on file open/reload it is indirect too...
 		 *       Edit Mode is also a 'skip direct' case. */
 		const bool is_obj = (GS(id->name) == ID_OB);
+		const bool is_obj_proxy = (is_obj && (((Object *)id)->proxy || ((Object *)id)->proxy_group));
 		const bool is_obj_editmode = (is_obj && BKE_object_is_in_editmode((Object *)id));
 		const bool is_never_null = ((cb_flag & IDWALK_CB_NEVER_NULL) && (new_id == NULL) &&
 		                            (id_remap_data->flag & ID_REMAP_FORCE_NEVER_NULL_USAGE) == 0);
@@ -231,7 +232,7 @@ static int foreach_libblock_remap_callback(void *user_data, ID *id_self, ID **id
 				/* We cannot affect old_id->us directly, LIB_TAG_EXTRAUSER(_SET) are assumed to be set as needed,
 				 * that extra user is processed in final handling... */
 			}
-			if (!is_indirect) {
+			if (!is_indirect || is_obj_proxy) {
 				id_remap_data->status |= ID_REMAP_IS_LINKED_DIRECT;
 			}
 		}
diff --git a/source/blender/blenkernel/intern/mesh.c b/source/blender/blenkernel/intern/mesh.c
index af02e02b017..c12890a354e 100644
--- a/source/blender/blenkernel/intern/mesh.c
+++ b/source/blender/blenkernel/intern/mesh.c
@@ -39,7 +39,9 @@
 
 #include "BLI_utildefines.h"
 #include "BLI_math.h"
+#include "BLI_linklist.h"
 #include "BLI_listbase.h"
+#include "BLI_memarena.h"
 #include "BLI_edgehash.h"
 #include "BLI_string.h"
 
@@ -66,6 +68,11 @@
 
 #include "DEG_depsgraph.h"
 
+/* Define for cases when you want extra validation of mesh
+ * after certain modifications.
+ */
+// #undef VALIDATE_MESH
+
 enum {
 	MESHCMP_DVERT_WEIGHTMISMATCH = 1,
 	MESHCMP_DVERT_GROUPMISMATCH,
@@ -2048,7 +2055,7 @@ void BKE_mesh_mselect_active_set(Mesh *me, int index, int type)
 	           (me->mselect[me->totselect - 1].type  == type));
 }
 
-void BKE_mesh_calc_normals_split(Mesh *mesh)
+void BKE_mesh_calc_normals_split_ex(Mesh *mesh, MLoopNorSpaceArray *r_lnors_spacearr)
 {
 	float (*r_loopnors)[3];
 	float (*polynors)[3];
@@ -2083,113 +2090,339 @@ void BKE_mesh_calc_normals_split(Mesh *mesh)
 	BKE_mesh_normals_loop_split(
 	        mesh->mvert, mesh->totvert, mesh->medge, mesh->totedge,
 	        mesh->mloop, r_loopnors, mesh->totloop, mesh->mpoly, (const float (*)[3])polynors, mesh->totpoly,
-	        (mesh->flag & ME_AUTOSMOOTH) != 0, mesh->smoothresh, NULL, clnors, NULL);
+	        (mesh->flag & ME_AUTOSMOOTH) != 0, mesh->smoothresh, r_lnors_spacearr, clnors, NULL);
 
 	if (free_polynors) {
 		MEM_freeN(polynors);
 	}
 }
 
-/* Spli faces based on the edge angle.
- * Matches behavior of face splitting in render engines.
- */
-void BKE_mesh_split_faces(Mesh *mesh)
+void BKE_mesh_calc_normals_split(Mesh *mesh)
 {
-	const int num_verts = mesh->totvert;
-	const int num_edges = mesh->totedge;
-	const int num_polys = mesh->totpoly;
+	BKE_mesh_calc_normals_split_ex(mesh, NULL);
+}
+
+/* Split faces helper functions. */
+
+typedef struct SplitFaceNewVert {
+	struct SplitFaceNewVert *next;
+	int new_index;
+	int orig_index;
+	float *vnor;
+} SplitFaceNewVert;
+
+typedef struct SplitFaceNewEdge {
+	struct SplitFaceNewEdge *next;
+	int new_index;
+	int orig_index;
+	int v1;
+	int v2;
+} SplitFaceNewEdge;
+
+/* Detect needed new vertices, and update accordingly loops' vertex indices.
+ * WARNING! Leaves mesh in invalid state. */
+static int split_faces_prepare_new_verts(
+        const Mesh *mesh, MLoopNorSpaceArray *lnors_spacearr, SplitFaceNewVert **new_verts, MemArena *memarena,
+        bool *r_need_vnors_recalc)
+{
+	/* Note: if lnors_spacearr is NULL, there is no autosmooth handling, and we only split out flat polys. */
+	const int num_loops = mesh->totloop;
+	int num_verts = mesh->totvert;
 	MVert *mvert = mesh->mvert;
-	MEdge *medge = mesh->medge;
 	MLoop *mloop = mesh->mloop;
-	MPoly *mpoly = mesh->mpoly;
-	float (*lnors)[3];
-	int poly, num_new_verts = 0;
-	if ((mesh->flag & ME_AUTOSMOOTH) == 0) {
-		return;
-	}
-	BKE_mesh_tessface_clear(mesh);
-	/* Compute loop normals if needed. */
-	if (!CustomData_has_layer(&mesh->ldata, CD_NORMAL)) {
-		BKE_mesh_calc_normals_split(mesh);
-	}
-	lnors = CustomData_get_layer(&mesh->ldata, CD_NORMAL);
-	/* Count number of vertices to be split. */
-	for (poly = 0; poly < num_polys; poly++) {
-		MPoly *mp = &mpoly[poly];
-		int loop;
-		for (loop = 0; loop < mp->totloop; loop++) {
-			MLoop *ml = &mloop[mp->loopstart + loop];
-			MVert *mv = &mvert[ml->v];
-			float vn[3];
-			normal_short_to_float_v3(vn, mv->no);
-			if (!equals_v3v3(vn, lnors[mp->loopstart + loop])) {
-				num_new_verts++;
+
+	BLI_bitmap *verts_used = BLI_BITMAP_NEW(num_verts, __func__);
+
+	if (lnors_spacearr) {
+		BLI_bitmap *done_loops = BLI_BITMAP_NEW(num_loops, __func__);
+
+		MLoop *ml = mloop;
+		MLoopNorSpace **lnor_space = lnors_spacearr->lspacearr;
+		for (int loop_idx = 0; loop_idx < num_loops; loop_idx++, ml++, lnor_space++) {
+			if (!BLI_BITMAP_TEST(done_loops, loop_idx)) {
+				const int vert_idx = ml->v;
+				const bool vert_used = BLI_BITMAP_TEST_BOOL(verts_used, vert_idx);
+				/* If vert is already used by another smooth fan, we need a new vert for this one. */
+				const int new_vert_idx = vert_used ? num_verts++ : vert_idx;
+
+				BLI_assert(*lnor_space);
+
+				if ((*lnor_space)->loops) {
+					for (LinkNode *lnode = (*lnor_space)->loops; lnode; lnode = lnode->next) {
+						const int ml_fan_idx = GET_INT_FROM_POINTER(lnode->link);
+						BLI_BITMAP_ENABLE(done_loops, ml_fan_idx);
+						if (vert_used) {
+							mloop[ml_fan_idx].v = new_vert_idx;
+						}
+					}
+				}
+				else {
+					/* Single loop in this fan... */
+					BLI_BITMAP_ENABLE(done_loops, loop_idx);
+					if (vert_used) {
+						ml->v = new_vert_idx;
+					}
+				}
+
+				if (!vert_used) {
+					BLI_BITMAP_ENABLE(verts_used, vert_idx);
+					/* We need to update that vertex's normal here, we won't go over it again. */
+					/* This is important! *DO NOT* set vnor to final computed lnor, vnor should always be defined to
+					 * 'automatic normal' value computed from its polys, not some custom normal.
+					 * Fortunately, that's the loop normal space's 'lnor' reference vector. ;) */
+					normal_float_to_short_v3(mvert[vert_idx].no, (*lnor_space)->vec_lnor);
+				}
+				else {
+					/* Add new vert to list. */
+					SplitFaceNewVert *new_vert = BLI_memarena_alloc(memarena, sizeof(*new_vert));
+					new_vert->orig_index = vert_idx;
+					new_vert->new_index = new_vert_idx;
+					new_vert->vnor = (*lnor_space)->vec_lnor;  /* See note above. */
+					new_vert->next = *new_verts;
+					*new_verts = new_vert;
+				}
 			}
 		}
+
+		MEM_freeN(done_loops);
 	}
-	if (num_new_verts == 0) {
-		/* No new vertices are to be added, can do early exit. */
-		return;
-	}
-	/* Reallocate all vert and edge related data. */
-	mesh->totvert += num_new_verts;
-	mesh->totedge += 2 * num_new_verts;
-	CustomData_realloc(&mesh->vdata, mesh->totvert);
-	CustomData_realloc(&mesh->edata, mesh->totedge);
-	/* Update pointers to a newly allocated memory. */
-	BKE_mesh_update_customdata_pointers(mesh, false);
-	mvert = mesh->mvert;
-	medge = mesh->medge;
-	/* Perform actual vertex split. */
-	num_new_verts = 0;
-	for (poly = 0; poly < num_polys; poly++) {
-		MPoly *mp = &mpoly[poly];
-		int loop;
-		for (loop = 0; loop < mp->totloop; loop++) {
-			int poly_loop = mp->loopstart + loop;
-			MLoop *ml = &mloop[poly_loop];
-			MVert *mv = &mvert[ml->v];
-			float vn[3];
-			normal_short_to_float_v3(vn, mv->no);
-			if (!equals_v3v3(vn, lnors[mp->loopstart + loop])) {
-				int poly_loop_prev = mp->loopstart + (loop + mp->totloop - 1) % mp->totloop;
-				MLoop *ml_prev = &mloop[poly_loop_prev];
-				int new_edge_prev, new_edge;
-				/* Cretae new vertex. */
-				int new_vert = num_verts + num_new_verts;
-				CustomData_copy_data(&mesh->vdata, &mesh->vdata,
-				                     ml->v, new_vert, 1);
-				normal_float_to_short_v3(mvert[new_vert].no,
-				                         lnors[poly_loop]);
-				/* Create new edges. */
-				new_edge_prev = num_edges + 2 * num_new_verts;
-				new_edge = num_edges + 2 * num_new_verts + 1;
-				CustomData_copy_data(&mesh->edata, &mesh->edata,
-				                     ml_prev->e, new_edge_prev, 1);
-				CustomData_copy_data(&mesh->edata, &mesh->edata,
-				                     ml->e, new_edge, 1);
-				if (medge[new_edge_prev].v1 == ml->v) {
-					medge[new_edge_prev].v1 = new_vert;
+	else {
+		/* No loop normal spaces available, we only split out flat polys. */
+		const int num_polys = mesh->totpoly;
+		const MPoly *mpoly = mesh->mpoly;
+
+		/* We do that in two loops, to keep original edges/verts to smooth polys preferentially. */
+		const MPoly *mp = mpoly;
+		for (int i = 0; i < num_polys; i++, mp++) {
+			if (mp->flag & ME_SMOOTH) {
+				const MLoop *ml = &mloop[mp->loopstart];
+				for (int j = 0; j < mp->totloop; j++, ml++) {
+					/* Just mark the vertex as used/reserved, that way neighbor flat polys, if any,
+					 * will have to create their own. */
+					BLI_BITMAP_ENABLE(verts_used, ml->v);
 				}
-				else {
-					medge[new_edge_prev].v2 = new_vert;
+			}
+		}
+
+		mp = mpoly;
+		for (int i = 0; i < num_polys; i++, mp++) {
+			if (!(mp->flag & ME_SMOOTH)) {
+				MLoop *ml = &mloop[mp->loopstart];
+				for (int j = 0; j < mp->totloop; j++, ml++) {
+					const int vert_idx = ml->v;
+
+					if (BLI_BITMAP_TEST(verts_used, vert_idx)) {
+						/* Add new vert to list. */
+						const int new_vert_idx = num_verts++;
+						ml->v = new_vert_idx;
+
+						SplitFaceNewVert *new_vert = BLI_memarena_alloc(memarena, sizeof(*new_vert));
+						new_vert->orig_index = vert_idx;
+						new_vert->new_index = new_vert_idx;
+						new_vert->vnor = NULL;  /* See note below about normals. */
+						new_vert->next = *new_verts;
+						*new_verts = new_vert;
+					}
+					else {
+						BLI_BITMAP_ENABLE(verts_used, vert_idx);
+					}
 				}
-				if (medge[new_edge].v1 == ml->v) {
-					medge[new_edge].v1 = new_vert;
+				/* Note: there is no way to get new normals for smooth vertices here (and we don't have direct access
+				 * to poly normals either for flat ones), so we'll have to recompute all vnors at the end... */
+				*r_need_vnors_recalc = true;
+			}
+		}
+	}
+
+	MEM_freeN(verts_used);
+
+	return num_verts - mesh->totvert;
+}
+
+/* Detect needed new edges, and update accordingly loops' edge indices.
+ * WARNING! Leaves mesh in invalid state. */
+static int split_faces_prepare_new_edges(
+        const Mesh *mesh, SplitFaceNewEdge **new_edges, MemArena *memarena)
+{
+	const int num_polys = mesh->totpoly;
+	int num_edges = mesh->totedge;
+	MEdge *medge = mesh->medge;
+	MLoop *mloop = mesh->mloop;
+	const MPoly *mpoly = mesh->mpoly;
+
+	BLI_bitmap *edges_used = BLI_BITMAP_NEW(num_edges, __func__);
+	EdgeHash *edges_hash = BLI_edgehash_new_ex(__func__, num_edges);
+
+	const MPoly *mp = mpoly;
+	for (int poly_idx = 0; poly_idx < num_polys; poly_idx++, mp++) {
+		MLoop *ml_prev = &mloop[mp->loopstart + mp->totloop - 1];
+		MLoop *ml = &mloop[mp->loopstart];
+		for (int loop_idx = 0; loop_idx < mp->totloop; loop_idx++, ml++) {
+			void **eval;
+			if (!BLI_edgehash_ensure_p(edges_hash, ml_prev->v, ml->v, &eval)) {
+				const int edge_idx = ml_prev->e;
+
+				/* That edge has not been encountered yet, define it. */
+				if (BLI_BITMAP_TEST(edges_used, edge_idx)) {
+					/* Original edge has already been used, we need to define a new one. */
+					const int new_edge_idx = num_edges++;
+					*eval = SET_INT_IN_POINTER(new_edge_idx);
+					ml_prev->e = new_edge_idx;
+
+					SplitFaceNewEdge *new_edge = BLI_memarena_alloc(memarena, sizeof(*new_edge));
+					new_edge->orig_index = edge_idx;
+					new_edge->new_index = new_edge_idx;
+					new_edge->v1 = ml_prev->v;
+					new_edge->v2 = ml->v;
+					new_edge->next = *new_edges;
+					*new_edges = new_edge;
 				}
 				else {
-					medge[new_edge].v2 = new_vert;
+					/* We can re-use original edge. */
+					medge[edge_idx].v1 = ml_prev->v;
+					medge[edge_idx].v2 = ml->v;
+					*eval = SET_INT_IN_POINTER(edge_idx);
+					BLI_BITMAP_ENABLE(edges_used, edge_idx);
 				}
-
-				ml->v = new_vert;
-				ml_prev->e = new_edge_prev;
-				ml->e = new_edge;
-				num_new_verts++;
 			}
+			else {
+				/* Edge already known, just update loop's edge index. */
+				ml_prev->e = GET_INT_FROM_POINTER(*eval);
+			}
+
+			ml_prev = ml;
+		}
+	}
+
+	MEM_freeN(edges_used);
+	BLI_edgehash_free(edges_hash, NULL);
+
+	return num_edges - mesh->totedge;
+}
+
+/* Perform actual split of vertices. */
+static void split_faces_split_new_verts(
+        Mesh *mesh, SplitFaceNewVert *new_verts, const int num_new_verts)
+{
+	const int num_verts = mesh->totvert - num_new_verts;
+	MVert *mvert = mesh->mvert;
+
+	/* Remember new_verts is a single linklist, so its items are in reversed order... */
+	MVert *new_mv = &mvert[mesh->totvert - 1];
+	for (int i = mesh->totvert - 1; i >= num_verts ; i--, new_mv--, new_verts = new_verts->next) {
+		BLI_assert(new_verts->new_index == i);
+		BLI_assert(new_verts->new_index != new_verts->orig_index);
+		CustomData_copy_data(&mesh->vdata, &mesh->vdata, new_verts->orig_index, i, 1);
+		if (new_verts->vnor) {
+			normal_float_to_short_v3(new_mv->no, new_verts->vnor);
 		}
 	}
 }
 
+/* Perform actual split of edges. */
+static void split_faces_split_new_edges(
+        Mesh *mesh, SplitFaceNewEdge *new_edges, const int num_new_edges)
+{
+	const int num_edges = mesh->totedge - num_new_edges;
+	MEdge *medge = mesh->medge;
+
+	/* Remember new_edges is a single linklist, so its items are in reversed order... */
+	MEdge *new_med = &medge[mesh->totedge - 1];
+	for (int i = mesh->totedge - 1; i >= num_edges ; i--, new_med--, new_edges = new_edges->next) {
+		BLI_assert(new_edges->new_index == i);
+		BLI_assert(new_edges->new_index != new_edges->orig_index);
+		CustomData_copy_data(&mesh->edata, &mesh->edata, new_edges->orig_index, i, 1);
+		new_med->v1 = new_edges->v1;
+		new_med->v2 = new_edges->v2;
+	}
+}
+
+/* Split faces based on the edge angle and loop normals.
+ * Matches behavior of face splitting in render engines.
+ *
+ * NOTE: Unless free_loop_normals is set, leaves the CD_NORMAL loop data
+ * layer in place; it is used by render engines to set shading up.
+ */
+void BKE_mesh_split_faces(Mesh *mesh, bool free_loop_normals)
+{
+	const int num_polys = mesh->totpoly;
+
+	if (num_polys == 0) {
+		return;
+	}
+	BKE_mesh_tessface_clear(mesh);
+
+	MLoopNorSpaceArray *lnors_spacearr = NULL;
+	MemArena *memarena;
+	bool need_vnors_recalc = false;
+
+	if (mesh->flag & ME_AUTOSMOOTH) {
+		lnors_spacearr = MEM_callocN(sizeof(*lnors_spacearr), __func__);
+		/* Compute loop normals and loop normal spaces (a.k.a. smooth fans of faces around vertices). */
+		BKE_mesh_calc_normals_split_ex(mesh, lnors_spacearr);
+		/* Stealing memarena from loop normals space array. */
+		memarena = lnors_spacearr->mem;
+	}
+	else {
+		/* We still have to split out flat faces... */
+		memarena = BLI_memarena_new(BLI_MEMARENA_STD_BUFSIZE, __func__);
+	}
+
+	SplitFaceNewVert *new_verts = NULL;
+	SplitFaceNewEdge *new_edges = NULL;
+
+	/* Detect loop normal spaces (a.k.a. smooth fans) that will need a new vert. */
+	const int num_new_verts = split_faces_prepare_new_verts(mesh, lnors_spacearr, &new_verts, memarena, &need_vnors_recalc);
+
+	if (num_new_verts > 0) {
+		/* Reminder: beyond this point, there is no way out, mesh is in invalid state (due to early-reassignment of
+		 * loops' vertex and edge indices to new, to-be-created split ones). */
+
+		const int num_new_edges = split_faces_prepare_new_edges(mesh, &new_edges, memarena);
+		/* We may have to split a vertex without having to add a single new edge... */
+		const bool do_edges = (num_new_edges > 0);
+
+		/* Reallocate all vert and edge related data. */
+		mesh->totvert += num_new_verts;
+		mesh->totedge += num_new_edges;
+		CustomData_realloc(&mesh->vdata, mesh->totvert);
+		if (do_edges) {
+			CustomData_realloc(&mesh->edata, mesh->totedge);
+		}
+		/* Update pointers to a newly allocated memory. */
+		BKE_mesh_update_customdata_pointers(mesh, false);
+
+		/* Perform actual split of vertices and edges. */
+		split_faces_split_new_verts(mesh, new_verts, num_new_verts);
+		if (do_edges) {
+			split_faces_split_new_edges(mesh, new_edges, num_new_edges);
+		}
+	}
+
+	/* Note: after this point mesh is expected to be valid again. */
+
+	/* CD_NORMAL is expected to be temporary only. */
+	if (free_loop_normals) {
+		CustomData_free_layers(&mesh->ldata, CD_NORMAL, mesh->totloop);
+	}
+
+	if (lnors_spacearr) {
+		/* Also frees new_verts/edges temp data, since we used its memarena to allocate them. */
+		BKE_lnor_spacearr_free(lnors_spacearr);
+		MEM_freeN(lnors_spacearr);
+	}
+	else {
+		BLI_memarena_free(memarena);
+	}
+
+	if (need_vnors_recalc) {
+		BKE_mesh_calc_normals(mesh);
+	}
+#ifdef VALIDATE_MESH
+	BKE_mesh_validate(mesh, true, true);
+#endif
+}
+
 /* settings: 1 - preview, 2 - render */
 Mesh *BKE_mesh_new_from_object(
         Main *bmain, Scene *sce, Object *ob,
diff --git a/source/blender/blenkernel/intern/mesh_evaluate.c b/source/blender/blenkernel/intern/mesh_evaluate.c
index f9eba118383..0d0055113b7 100644
--- a/source/blender/blenkernel/intern/mesh_evaluate.c
+++ b/source/blender/blenkernel/intern/mesh_evaluate.c
@@ -304,13 +304,13 @@ void BKE_mesh_calc_normals_poly(
 void BKE_mesh_calc_normals(Mesh *mesh)
 {
 #ifdef DEBUG_TIME
-	TIMEIT_START(BKE_mesh_calc_normals);
+	TIMEIT_START_AVERAGED(BKE_mesh_calc_normals);
 #endif
 	BKE_mesh_calc_normals_poly(mesh->mvert, NULL, mesh->totvert,
 	                           mesh->mloop, mesh->mpoly, mesh->totloop, mesh->totpoly,
 	                           NULL, false);
 #ifdef DEBUG_TIME
-	TIMEIT_END(BKE_mesh_calc_normals);
+	TIMEIT_END_AVERAGED(BKE_mesh_calc_normals);
 #endif
 }
 
@@ -630,7 +630,6 @@ typedef struct LoopSplitTaskDataCommon {
 	 * Note we do not need to protect it, though, since two different tasks will *always* affect different
 	 * elements in the arrays. */
 	MLoopNorSpaceArray *lnors_spacearr;
-	BLI_bitmap *sharp_verts;
 	float (*loopnors)[3];
 	short (*clnors_data)[2];
 
@@ -643,11 +642,8 @@ typedef struct LoopSplitTaskDataCommon {
 	const int *loop_to_poly;
 	const float (*polynors)[3];
 
+	int numLoops;
 	int numPolys;
-
-	/* ***** Workers communication. ***** */
-	ThreadQueue *task_queue;
-
 } LoopSplitTaskDataCommon;
 
 #define INDEX_UNSET INT_MIN
@@ -655,6 +651,50 @@ typedef struct LoopSplitTaskDataCommon {
 /* See comment about edge_to_loops below. */
 #define IS_EDGE_SHARP(_e2l) (ELEM((_e2l)[1], INDEX_UNSET, INDEX_INVALID))
 
+static void loop_manifold_fan_around_vert_next(
+        const MLoop *mloops, const MPoly *mpolys,
+        const int *loop_to_poly, const int *e2lfan_curr, const uint mv_pivot_index,
+        const MLoop **r_mlfan_curr, int *r_mlfan_curr_index, int *r_mlfan_vert_index, int *r_mpfan_curr_index)
+{
+	const MLoop *mlfan_next;
+	const MPoly *mpfan_next;
+
+	/* Warning! This is rather complex!
+	 * We have to find our next edge around the vertex (fan mode).
+	 * First we find the next loop, which is either previous or next to mlfan_curr_index, depending
+	 * whether both loops using current edge are in the same direction or not, and whether
+	 * mlfan_curr_index actually uses the vertex we are fanning around!
+	 * mlfan_curr_index is the index of mlfan_next here, and mlfan_next is not the real next one
+	 * (i.e. not the future mlfan_curr)...
+	 */
+	*r_mlfan_curr_index = (e2lfan_curr[0] == *r_mlfan_curr_index) ? e2lfan_curr[1] : e2lfan_curr[0];
+	*r_mpfan_curr_index = loop_to_poly[*r_mlfan_curr_index];
+
+	BLI_assert(*r_mlfan_curr_index >= 0);
+	BLI_assert(*r_mpfan_curr_index >= 0);
+
+	mlfan_next = &mloops[*r_mlfan_curr_index];
+	mpfan_next = &mpolys[*r_mpfan_curr_index];
+	if (((*r_mlfan_curr)->v == mlfan_next->v && (*r_mlfan_curr)->v == mv_pivot_index) ||
+	    ((*r_mlfan_curr)->v != mlfan_next->v && (*r_mlfan_curr)->v != mv_pivot_index))
+	{
+		/* We need the previous loop, but current one is our vertex's loop. */
+		*r_mlfan_vert_index = *r_mlfan_curr_index;
+		if (--(*r_mlfan_curr_index) < mpfan_next->loopstart) {
+			*r_mlfan_curr_index = mpfan_next->loopstart + mpfan_next->totloop - 1;
+		}
+	}
+	else {
+		/* We need the next loop, which is also our vertex's loop. */
+		if (++(*r_mlfan_curr_index) >= mpfan_next->loopstart + mpfan_next->totloop) {
+			*r_mlfan_curr_index = mpfan_next->loopstart;
+		}
+		*r_mlfan_vert_index = *r_mlfan_curr_index;
+	}
+	*r_mlfan_curr = &mloops[*r_mlfan_curr_index];
+	/* And now we are back in sync, mlfan_curr_index is the index of mlfan_curr! Pff! */
+}
+
 static void split_loop_nor_single_do(LoopSplitTaskDataCommon *common_data, LoopSplitTaskData *data)
 {
 	MLoopNorSpaceArray *lnors_spacearr = common_data->lnors_spacearr;
@@ -680,7 +720,7 @@ static void split_loop_nor_single_do(LoopSplitTaskDataCommon *common_data, LoopS
 	 */
 	copy_v3_v3(*lnor, polynors[mp_index]);
 
-	/* printf("BASIC: handling loop %d / edge %d / vert %d / poly %d\n", ml_curr_index, ml_curr->e, ml_curr->v, mp_index); */
+//	printf("BASIC: handling loop %d / edge %d / vert %d / poly %d\n", ml_curr_index, ml_curr->e, ml_curr->v, mp_index);
 
 	/* If needed, generate this (simple!) lnor space. */
 	if (lnors_spacearr) {
@@ -747,8 +787,7 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSpli
 	const MEdge *me_org = &medges[ml_curr->e];  /* ml_curr would be mlfan_prev if we needed that one */
 	const int *e2lfan_curr;
 	float vec_curr[3], vec_prev[3], vec_org[3];
-	const MLoop *mlfan_curr, *mlfan_next;
-	const MPoly *mpfan_next;
+	const MLoop *mlfan_curr;
 	float lnor[3] = {0.0f, 0.0f, 0.0f};
 	/* mlfan_vert_index: the loop of our current edge might not be the loop of our current vertex! */
 	int mlfan_curr_index, mlfan_vert_index, mpfan_curr_index;
@@ -787,7 +826,7 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSpli
 		}
 	}
 
-	/* printf("FAN: vert %d, start edge %d\n", mv_pivot_index, ml_curr->e); */
+//	printf("FAN: vert %d, start edge %d\n", mv_pivot_index, ml_curr->e);
 
 	while (true) {
 		const MEdge *me_curr = &medges[mlfan_curr->e];
@@ -803,7 +842,7 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSpli
 			normalize_v3(vec_curr);
 		}
 
-		/* printf("\thandling edge %d / loop %d\n", mlfan_curr->e, mlfan_curr_index); */
+//		printf("\thandling edge %d / loop %d\n", mlfan_curr->e, mlfan_curr_index);
 
 		{
 			/* Code similar to accumulate_vertex_normals_poly. */
@@ -845,46 +884,16 @@ static void split_loop_nor_fan_do(LoopSplitTaskDataCommon *common_data, LoopSpli
 			/* Current edge is sharp and we have finished with this fan of faces around this vert,
 			 * or this vert is smooth, and we have completed a full turn around it.
 			 */
-			/* printf("FAN: Finished!\n"); */
+//			printf("FAN: Finished!\n");
 			break;
 		}
 
 		copy_v3_v3(vec_prev, vec_curr);
 
-		/* Warning! This is rather complex!
-		 * We have to find our next edge around the vertex (fan mode).
-		 * First we find the next loop, which is either previous or next to mlfan_curr_index, depending
-		 * whether both loops using current edge are in the same direction or not, and whether
-		 * mlfan_curr_index actually uses the vertex we are fanning around!
-		 * mlfan_curr_index is the index of mlfan_next here, and mlfan_next is not the real next one
-		 * (i.e. not the future mlfan_curr)...
-		 */
-		mlfan_curr_index = (e2lfan_curr[0] == mlfan_curr_index) ? e2lfan_curr[1] : e2lfan_curr[0];
-		mpfan_curr_index = loop_to_poly[mlfan_curr_index];
-
-		BLI_assert(mlfan_curr_index >= 0);
-		BLI_assert(mpfan_curr_index >= 0);
-
-		mlfan_next = &mloops[mlfan_curr_index];
-		mpfan_next = &mpolys[mpfan_curr_index];
-		if ((mlfan_curr->v == mlfan_next->v && mlfan_curr->v == mv_pivot_index) ||
-		    (mlfan_curr->v != mlfan_next->v && mlfan_curr->v != mv_pivot_index))
-		{
-			/* We need the previous loop, but current one is our vertex's loop. */
-			mlfan_vert_index = mlfan_curr_index;
-			if (--mlfan_curr_index < mpfan_next->loopstart) {
-				mlfan_curr_index = mpfan_next->loopstart + mpfan_next->totloop - 1;
-			}
-		}
-		else {
-			/* We need the next loop, which is also our vertex's loop. */
-			if (++mlfan_curr_index >= mpfan_next->loopstart + mpfan_next->totloop) {
-				mlfan_curr_index = mpfan_next->loopstart;
-			}
-			mlfan_vert_index = mlfan_curr_index;
-		}
-		mlfan_curr = &mloops[mlfan_curr_index];
-		/* And now we are back in sync, mlfan_curr_index is the index of mlfan_curr! Pff! */
+		/* Find next loop of the smooth fan. */
+		loop_manifold_fan_around_vert_next(
+		            mloops, mpolys, loop_to_poly, e2lfan_curr, mv_pivot_index,
+		            &mlfan_curr, &mlfan_curr_index, &mlfan_vert_index, &mpfan_curr_index);
 
 		e2lfan_curr = edge_to_loops[mlfan_curr->e];
 	}
@@ -955,31 +964,25 @@ static void loop_split_worker_do(
 	}
 }
 
-static void loop_split_worker(TaskPool * __restrict UNUSED(pool), void *taskdata, int UNUSED(threadid))
+static void loop_split_worker(TaskPool * __restrict pool, void *taskdata, int UNUSED(threadid))
 {
-	LoopSplitTaskDataCommon *common_data = taskdata;
-	LoopSplitTaskData *data_buff;
+	LoopSplitTaskDataCommon *common_data = BLI_task_pool_userdata(pool);
+	LoopSplitTaskData *data = taskdata;
 
 	/* Temp edge vectors stack, only used when computing lnor spacearr. */
 	BLI_Stack *edge_vectors = common_data->lnors_spacearr ? BLI_stack_new(sizeof(float[3]), __func__) : NULL;
 
 #ifdef DEBUG_TIME
-	TIMEIT_START(loop_split_worker);
+	TIMEIT_START_AVERAGED(loop_split_worker);
 #endif
 
-	while ((data_buff = BLI_thread_queue_pop(common_data->task_queue))) {
-		LoopSplitTaskData *data = data_buff;
-		int i;
-
-		for (i = 0; i < LOOP_SPLIT_TASK_BLOCK_SIZE; i++, data++) {
-			/* A NULL ml_curr is used to tag ended data! */
-			if (data->ml_curr == NULL) {
-				break;
-			}
-			loop_split_worker_do(common_data, data, edge_vectors);
+	for (int i = 0; i < LOOP_SPLIT_TASK_BLOCK_SIZE; i++, data++) {
+		/* A NULL ml_curr is used to tag ended data! */
+		if (data->ml_curr == NULL) {
+			break;
 		}
 
-		MEM_freeN(data_buff);
+		loop_split_worker_do(common_data, data, edge_vectors);
 	}
 
 	if (edge_vectors) {
@@ -987,49 +990,117 @@ static void loop_split_worker(TaskPool * __restrict UNUSED(pool), void *taskdata
 	}
 
 #ifdef DEBUG_TIME
-	TIMEIT_END(loop_split_worker);
+	TIMEIT_END_AVERAGED(loop_split_worker);
 #endif
 }
 
-/* Note we use data_buff to detect whether we are in threaded context or not, in later case it is NULL. */
-static void loop_split_generator_do(LoopSplitTaskDataCommon *common_data, const bool threaded)
+/* Check whether given loop is part of an unknown-so-far cyclic smooth fan, or not.
+ * Needed because cyclic smooth fans have no obvious 'entry point', and yet we need to walk them once, and only once. */
+static bool loop_split_generator_check_cyclic_smooth_fan(
+        const MLoop *mloops, const MPoly *mpolys,
+        const int (*edge_to_loops)[2], const int *loop_to_poly, const int *e2l_prev, BLI_bitmap *skip_loops,
+        const MLoop *ml_curr, const MLoop *ml_prev, const int ml_curr_index, const int ml_prev_index,
+        const int mp_curr_index)
+{
+	const unsigned int mv_pivot_index = ml_curr->v;  /* The vertex we are "fanning" around! */
+	const int *e2lfan_curr;
+	const MLoop *mlfan_curr;
+	/* mlfan_vert_index: the loop of our current edge might not be the loop of our current vertex! */
+	int mlfan_curr_index, mlfan_vert_index, mpfan_curr_index;
+
+	e2lfan_curr = e2l_prev;
+	if (IS_EDGE_SHARP(e2lfan_curr)) {
+		/* Sharp loop, so not a cyclic smooth fan... */
+		return false;
+	}
+
+	mlfan_curr = ml_prev;
+	mlfan_curr_index = ml_prev_index;
+	mlfan_vert_index = ml_curr_index;
+	mpfan_curr_index = mp_curr_index;
+
+	BLI_assert(mlfan_curr_index >= 0);
+	BLI_assert(mlfan_vert_index >= 0);
+	BLI_assert(mpfan_curr_index >= 0);
+
+	BLI_assert(!BLI_BITMAP_TEST(skip_loops, mlfan_vert_index));
+	BLI_BITMAP_ENABLE(skip_loops, mlfan_vert_index);
+
+	while(true) {
+		/* Find next loop of the smooth fan. */
+		loop_manifold_fan_around_vert_next(
+		            mloops, mpolys, loop_to_poly, e2lfan_curr, mv_pivot_index,
+		            &mlfan_curr, &mlfan_curr_index, &mlfan_vert_index, &mpfan_curr_index);
+
+		e2lfan_curr = edge_to_loops[mlfan_curr->e];
+
+		if (IS_EDGE_SHARP(e2lfan_curr)) {
+			/* Sharp loop/edge, so not a cyclic smooth fan... */
+			return false;
+		}
+		/* Smooth loop/edge... */
+		else if (BLI_BITMAP_TEST(skip_loops, mlfan_vert_index)) {
+			if (mlfan_vert_index == ml_curr_index) {
+				/* We walked around a whole cyclic smooth fan without finding any already-processed loop, means we can
+				 * use initial ml_curr/ml_prev edge as start for this smooth fan. */
+				return true;
+			}
+			/* ... already checked in some previous looping, we can abort. */
+			return false;
+		}
+		else {
+			/* ... we can skip it in future, and keep checking the smooth fan. */
+			BLI_BITMAP_ENABLE(skip_loops, mlfan_vert_index);
+		}
+	}
+}
+
+static void loop_split_generator(TaskPool *pool, LoopSplitTaskDataCommon *common_data)
 {
 	MLoopNorSpaceArray *lnors_spacearr = common_data->lnors_spacearr;
-	BLI_bitmap *sharp_verts = common_data->sharp_verts;
 	float (*loopnors)[3] = common_data->loopnors;
 
 	const MLoop *mloops = common_data->mloops;
 	const MPoly *mpolys = common_data->mpolys;
+	const int *loop_to_poly = common_data->loop_to_poly;
 	const int (*edge_to_loops)[2] = common_data->edge_to_loops;
+	const int numLoops = common_data->numLoops;
 	const int numPolys = common_data->numPolys;
 
 	const MPoly *mp;
 	int mp_index;
 
-	LoopSplitTaskData *data, *data_buff = NULL, data_mem;
+	const MLoop *ml_curr;
+	const MLoop *ml_prev;
+	int ml_curr_index;
+	int ml_prev_index;
+
+	BLI_bitmap *skip_loops = BLI_BITMAP_NEW(numLoops, __func__);
+
+	LoopSplitTaskData *data_buff = NULL;
 	int data_idx = 0;
 
 	/* Temp edge vectors stack, only used when computing lnor spacearr (and we are not multi-threading). */
-	BLI_Stack *edge_vectors = (lnors_spacearr && !data_buff) ? BLI_stack_new(sizeof(float[3]), __func__) : NULL;
+	BLI_Stack *edge_vectors = NULL;
 
 #ifdef DEBUG_TIME
-	TIMEIT_START(loop_split_generator);
+	TIMEIT_START_AVERAGED(loop_split_generator);
 #endif
 
-	if (!threaded) {
-		memset(&data_mem, 0, sizeof(data_mem));
-		data = &data_mem;
+	if (!pool) {
+		if (lnors_spacearr) {
+			edge_vectors = BLI_stack_new(sizeof(float[3]), __func__);
+		}
 	}
 
 	/* We now know edges that can be smoothed (with their vector, and their two loops), and edges that will be hard!
 	 * Now, time to generate the normals.
 	 */
 	for (mp = mpolys, mp_index = 0; mp_index < numPolys; mp++, mp_index++) {
-		const MLoop *ml_curr, *ml_prev;
 		float (*lnors)[3];
 		const int ml_last_index = (mp->loopstart + mp->totloop) - 1;
-		int ml_curr_index = mp->loopstart;
-		int ml_prev_index = ml_last_index;
+		ml_curr_index = mp->loopstart;
+		ml_prev_index = ml_last_index;
 
 		ml_curr = &mloops[ml_curr_index];
 		ml_prev = &mloops[ml_prev_index];
@@ -1039,23 +1110,40 @@ static void loop_split_generator_do(LoopSplitTaskDataCommon *common_data, const
 			const int *e2l_curr = edge_to_loops[ml_curr->e];
 			const int *e2l_prev = edge_to_loops[ml_prev->e];
 
-			if (!IS_EDGE_SHARP(e2l_curr) && (!lnors_spacearr || BLI_BITMAP_TEST_BOOL(sharp_verts, ml_curr->v))) {
-				/* A smooth edge, and we are not generating lnor_spacearr, or the related vertex is sharp.
-				 * We skip it because it is either:
-				 * - in the middle of a 'smooth fan' already computed (or that will be as soon as we hit
-				 *   one of its ends, i.e. one of its two sharp edges), or...
-				 * - the related vertex is a "full smooth" one, in which case pre-populated normals from vertex
-				 *   are just fine (or it has already be handled in a previous loop in case of needed lnors spacearr)!
-				 */
-				/* printf("Skipping loop %d / edge %d / vert %d(%d)\n", ml_curr_index, ml_curr->e, ml_curr->v, sharp_verts[ml_curr->v]); */
+//			printf("Checking loop %d / edge %u / vert %u (sharp edge: %d, skiploop: %d)...",
+//			       ml_curr_index, ml_curr->e, ml_curr->v, IS_EDGE_SHARP(e2l_curr), BLI_BITMAP_TEST_BOOL(skip_loops, ml_curr_index));
+
+			/* A smooth edge, we have to check for cyclic smooth fan case.
+			 * If we find a new, never-processed cyclic smooth fan, we can do it now using that loop/edge as
+			 * 'entry point', otherwise we can skip it. */
+			/* Note: In theory, we could make loop_split_generator_check_cyclic_smooth_fan() store
+			 * mlfan_vert_index'es and edge indexes in two stacks, to avoid having to fan again around the vert during
+			 * actual computation of clnor & clnorspace. However, this would complicate the code, add more memory usage,
+			 * and despite its logical complexity, loop_manifold_fan_around_vert_next() is quite cheap in terms of
+			 * CPU cycles, so I really think it's not worth it. */
+			if (!IS_EDGE_SHARP(e2l_curr) &&
+			    (BLI_BITMAP_TEST(skip_loops, ml_curr_index) ||
+			     !loop_split_generator_check_cyclic_smooth_fan(
+			              mloops, mpolys, edge_to_loops, loop_to_poly, e2l_prev, skip_loops,
+			              ml_curr, ml_prev, ml_curr_index, ml_prev_index, mp_index)))
+			{
+//				printf("SKIPPING!\n");
 			}
 			else {
-				if (threaded) {
+				LoopSplitTaskData *data, data_local;
+
+//				printf("PROCESSING!\n");
+
+				if (pool) {
 					if (data_idx == 0) {
 						data_buff = MEM_callocN(sizeof(*data_buff) * LOOP_SPLIT_TASK_BLOCK_SIZE, __func__);
 					}
 					data = &data_buff[data_idx];
 				}
+				else {
+					data = &data_local;
+					memset(data, 0, sizeof(*data));
+				}
 
 				if (IS_EDGE_SHARP(e2l_curr) && IS_EDGE_SHARP(e2l_prev)) {
 					data->lnor = lnors;
@@ -1091,22 +1179,18 @@ static void loop_split_generator_do(LoopSplitTaskDataCommon *common_data, const
 					data->mp_index = mp_index;
 					if (lnors_spacearr) {
 						data->lnor_space = BKE_lnor_space_create(lnors_spacearr);
-						/* Tag related vertex as sharp, to avoid fanning around it again (in case it was a smooth one).
-						 * This *has* to be done outside of workers tasks! */
-						BLI_BITMAP_ENABLE(sharp_verts, ml_curr->v);
 					}
 				}
 
-				if (threaded) {
+				if (pool) {
 					data_idx++;
 					if (data_idx == LOOP_SPLIT_TASK_BLOCK_SIZE) {
-						BLI_thread_queue_push(common_data->task_queue, data_buff);
+						BLI_task_pool_push(pool, loop_split_worker, data_buff, true, TASK_PRIORITY_LOW);
 						data_idx = 0;
 					}
 				}
 				else {
 					loop_split_worker_do(common_data, data, edge_vectors);
-					memset(data, 0, sizeof(data_mem));
 				}
 			}
 
@@ -1115,44 +1199,32 @@ static void loop_split_generator_do(LoopSplitTaskDataCommon *common_data, const
 		}
 	}
 
-	if (threaded) {
-		/* Last block of data... Since it is calloc'ed and we use first NULL item as stopper, everything is fine. */
-		if (LIKELY(data_idx)) {
-			BLI_thread_queue_push(common_data->task_queue, data_buff);
-		}
-
-		/* This will signal all other worker threads to wake up and finish! */
-		BLI_thread_queue_nowait(common_data->task_queue);
+	/* Last block of data... Since it is calloc'ed and we use first NULL item as stopper, everything is fine. */
+	if (pool && data_idx) {
+		BLI_task_pool_push(pool, loop_split_worker, data_buff, true, TASK_PRIORITY_LOW);
 	}
 
 	if (edge_vectors) {
 		BLI_stack_free(edge_vectors);
 	}
+	MEM_freeN(skip_loops);
 
 #ifdef DEBUG_TIME
-	TIMEIT_END(loop_split_generator);
+	TIMEIT_END_AVERAGED(loop_split_generator);
 #endif
 }
 
-static void loop_split_generator(TaskPool * __restrict UNUSED(pool), void *taskdata, int UNUSED(threadid))
-{
-	LoopSplitTaskDataCommon *common_data = taskdata;
-
-	loop_split_generator_do(common_data, true);
-}
-
 /**
  * Compute split normals, i.e. vertex normals associated with each poly (hence 'loop normals').
  * Useful to materialize sharp edges (or non-smooth faces) without actually modifying the geometry (splitting edges).
  */
 void BKE_mesh_normals_loop_split(
-        const MVert *mverts, const int numVerts, MEdge *medges, const int numEdges,
+        const MVert *mverts, const int UNUSED(numVerts), MEdge *medges, const int numEdges,
         MLoop *mloops, float (*r_loopnors)[3], const int numLoops,
         MPoly *mpolys, const float (*polynors)[3], const int numPolys,
         const bool use_split_normals, float split_angle,
         MLoopNorSpaceArray *r_lnors_spacearr, short (*clnors_data)[2], int *r_loop_to_poly)
 {
-
 	/* For now this is not supported. If we do not use split normals, we do not generate anything fancy! */
 	BLI_assert(use_split_normals || !(r_lnors_spacearr));
 
@@ -1185,8 +1257,6 @@ void BKE_mesh_normals_loop_split(
 		return;
 	}
 
-	{
-
 	/* Mapping edge -> loops.
 	 * If that edge is used by more than two loops (polys), it is always sharp (and tagged as such, see below).
 	 * We also use the second loop index as a kind of flag: smooth edge: > 0,
@@ -1196,33 +1266,25 @@ void BKE_mesh_normals_loop_split(
 	 * store the negated value of loop index instead of INDEX_INVALID to retrieve the real value later in code).
 	 * Note also that lose edges always have both values set to 0!
 	 */
-	int (*edge_to_loops)[2] = MEM_callocN(sizeof(int[2]) * (size_t)numEdges, __func__);
+	int (*edge_to_loops)[2] = MEM_callocN(sizeof(*edge_to_loops) * (size_t)numEdges, __func__);
 
 	/* Simple mapping from a loop to its polygon index. */
-	int *loop_to_poly = r_loop_to_poly ? r_loop_to_poly : MEM_mallocN(sizeof(int) * (size_t)numLoops, __func__);
+	int *loop_to_poly = r_loop_to_poly ? r_loop_to_poly : MEM_mallocN(sizeof(*loop_to_poly) * (size_t)numLoops, __func__);
 
 	MPoly *mp;
-	int mp_index, me_index;
-	bool check_angle = (split_angle < (float)M_PI);
-	int i;
+	int mp_index;
 
-	BLI_bitmap *sharp_verts = NULL;
-	MLoopNorSpaceArray _lnors_spacearr = {NULL};
+	/* When using custom loop normals, disable the angle feature! */
+	const bool check_angle = (split_angle < (float)M_PI) && (clnors_data == NULL);
 
-	LoopSplitTaskDataCommon common_data = {NULL};
+	MLoopNorSpaceArray _lnors_spacearr = {NULL};
 
 #ifdef DEBUG_TIME
-	TIMEIT_START(BKE_mesh_normals_loop_split);
+	TIMEIT_START_AVERAGED(BKE_mesh_normals_loop_split);
 #endif
 
 	if (check_angle) {
-		/* When using custom loop normals, disable the angle feature! */
-		if (clnors_data) {
-			check_angle = false;
-		}
-		else {
-			split_angle = cosf(split_angle);
-		}
+		split_angle = cosf(split_angle);
 	}
 
 	if (!r_lnors_spacearr && clnors_data) {
@@ -1231,7 +1293,6 @@ void BKE_mesh_normals_loop_split(
 	}
 	if (r_lnors_spacearr) {
 		BKE_lnor_spacearr_init(r_lnors_spacearr, numLoops);
-		sharp_verts = BLI_BITMAP_NEW((size_t)numVerts, __func__);
 	}
 
 	/* This first loop check which edges are actually smooth, and compute edge vectors. */
@@ -1285,60 +1346,38 @@ void BKE_mesh_normals_loop_split(
 		}
 	}
 
-	if (r_lnors_spacearr) {
-		/* Tag vertices that have at least one sharp edge as 'sharp' (used for the lnor spacearr computation).
-		 * XXX This third loop over edges is a bit disappointing, could not find any other way yet.
-		 *     Not really performance-critical anyway.
-		 */
-		for (me_index = 0; me_index < numEdges; me_index++) {
-			const int *e2l = edge_to_loops[me_index];
-			const MEdge *me = &medges[me_index];
-			if (IS_EDGE_SHARP(e2l)) {
-				BLI_BITMAP_ENABLE(sharp_verts, me->v1);
-				BLI_BITMAP_ENABLE(sharp_verts, me->v2);
-			}
-		}
-	}
-
 	/* Init data common to all tasks. */
-	common_data.lnors_spacearr = r_lnors_spacearr;
-	common_data.loopnors = r_loopnors;
-	common_data.clnors_data = clnors_data;
-
-	common_data.mverts = mverts;
-	common_data.medges = medges;
-	common_data.mloops = mloops;
-	common_data.mpolys = mpolys;
-	common_data.sharp_verts = sharp_verts;
-	common_data.edge_to_loops = (const int(*)[2])edge_to_loops;
-	common_data.loop_to_poly = loop_to_poly;
-	common_data.polynors = polynors;
-	common_data.numPolys = numPolys;
+	LoopSplitTaskDataCommon common_data = {
+	    .lnors_spacearr = r_lnors_spacearr,
+	    .loopnors = r_loopnors,
+	    .clnors_data = clnors_data,
+	    .mverts = mverts,
+	    .medges = medges,
+	    .mloops = mloops,
+	    .mpolys = mpolys,
+	    .edge_to_loops = (const int(*)[2])edge_to_loops,
+	    .loop_to_poly = loop_to_poly,
+	    .polynors = polynors,
+	    .numLoops = numLoops,
+	    .numPolys = numPolys,
+	};
 
 	if (numLoops < LOOP_SPLIT_TASK_BLOCK_SIZE * 8) {
 		/* Not enough loops to be worth the whole threading overhead... */
-		loop_split_generator_do(&common_data, false);
+		loop_split_generator(NULL, &common_data);
 	}
 	else {
 		TaskScheduler *task_scheduler;
 		TaskPool *task_pool;
-		int nbr_workers;
-
-		common_data.task_queue = BLI_thread_queue_init();
 
 		task_scheduler = BLI_task_scheduler_get();
-		task_pool = BLI_task_pool_create(task_scheduler, NULL);
+		task_pool = BLI_task_pool_create(task_scheduler, &common_data);
+
+		loop_split_generator(task_pool, &common_data);
 
-		nbr_workers = max_ii(2, BLI_task_scheduler_num_threads(task_scheduler));
-		for (i = 1; i < nbr_workers; i++) {
-			BLI_task_pool_push(task_pool, loop_split_worker, &common_data, false, TASK_PRIORITY_HIGH);
-		}
-		BLI_task_pool_push(task_pool, loop_split_generator, &common_data, false, TASK_PRIORITY_HIGH);
 		BLI_task_pool_work_and_wait(task_pool);
 
 		BLI_task_pool_free(task_pool);
-
-		BLI_thread_queue_free(common_data.task_queue);
 	}
 
 	MEM_freeN(edge_to_loops);
@@ -1347,17 +1386,14 @@ void BKE_mesh_normals_loop_split(
 	}
 
 	if (r_lnors_spacearr) {
-		MEM_freeN(sharp_verts);
 		if (r_lnors_spacearr == &_lnors_spacearr) {
 			BKE_lnor_spacearr_free(r_lnors_spacearr);
 		}
 	}
 
 #ifdef DEBUG_TIME
-	TIMEIT_END(BKE_mesh_normals_loop_split);
+	TIMEIT_END_AVERAGED(BKE_mesh_normals_loop_split);
 #endif
-
-	}
 }
 
 #undef INDEX_UNSET
diff --git a/source/blender/blenkernel/intern/object.c b/source/blender/blenkernel/intern/object.c
index ff8be5892e9..6e754755cf3 100644
--- a/source/blender/blenkernel/intern/object.c
+++ b/source/blender/blenkernel/intern/object.c
@@ -2236,66 +2236,6 @@ void BKE_boundbox_minmax(const BoundBox *bb, float obmat[4][4], float r_min[3],
 	}
 }
 
-/**
- * Returns a BBox which each dimensions are at least epsilon.
- * \note In case a given dimension needs to be enlarged, its final value will be in [epsilon, 3 * epsilon] range.
- *
- * \param bb the input bbox to check.
- * \param bb_temp the temp bbox to modify (\a bb content is never changed).
- * \param epsilon the minimum dimension to ensure.
- * \return either bb (if nothing needed to be changed) or bb_temp.
- */
-BoundBox *BKE_boundbox_ensure_minimum_dimensions(BoundBox *bb, BoundBox *bb_temp, const float epsilon)
-{
-	if (fabsf(bb->vec[0][0] - bb->vec[4][0]) < epsilon) {
-		/* Flat along X axis... */
-		*bb_temp = *bb;
-		bb = bb_temp;
-		bb->vec[0][0] -= epsilon;
-		bb->vec[1][0] -= epsilon;
-		bb->vec[2][0] -= epsilon;
-		bb->vec[3][0] -= epsilon;
-		bb->vec[4][0] += epsilon;
-		bb->vec[5][0] += epsilon;
-		bb->vec[6][0] += epsilon;
-		bb->vec[7][0] += epsilon;
-	}
-
-	if (fabsf(bb->vec[0][1] - bb->vec[3][1]) < epsilon) {
-		/* Flat along Y axis... */
-		if (bb != bb_temp) {
-			*bb_temp = *bb;
-			bb = bb_temp;
-		}
-		bb->vec[0][1] -= epsilon;
-		bb->vec[1][1] -= epsilon;
-		bb->vec[4][1] -= epsilon;
-		bb->vec[5][1] -= epsilon;
-		bb->vec[2][1] += epsilon;
-		bb->vec[3][1] += epsilon;
-		bb->vec[6][1] += epsilon;
-		bb->vec[7][1] += epsilon;
-	}
-
-	if (fabsf(bb->vec[0][2] - bb->vec[1][2]) < epsilon) {
-		/* Flat along Z axis... */
-		if (bb != bb_temp) {
-			*bb_temp = *bb;
-			bb = bb_temp;
-		}
-		bb->vec[0][2] -= epsilon;
-		bb->vec[3][2] -= epsilon;
-		bb->vec[4][2] -= epsilon;
-		bb->vec[7][2] -= epsilon;
-		bb->vec[1][2] += epsilon;
-		bb->vec[2][2] += epsilon;
-		bb->vec[5][2] += epsilon;
-		bb->vec[6][2] += epsilon;
-	}
-
-	return bb;
-}
-
 BoundBox *BKE_object_boundbox_get(Object *ob)
 {
 	BoundBox *bb = NULL;
diff --git a/source/blender/blenkernel/intern/object_update.c b/source/blender/blenkernel/intern/object_update.c
index 5cb704e4737..a531466294e 100644
--- a/source/blender/blenkernel/intern/object_update.c
+++ b/source/blender/blenkernel/intern/object_update.c
@@ -54,6 +54,7 @@
 #include "BKE_editmesh.h"
 #include "BKE_object.h"
 #include "BKE_particle.h"
+#include "BKE_pointcache.h"
 #include "BKE_scene.h"
 #include "BKE_material.h"
 #include "BKE_image.h"
@@ -347,3 +348,9 @@ void BKE_object_eval_uber_data(EvaluationContext *eval_ctx,
 
 	ob->recalc &= ~(OB_RECALC_DATA | OB_RECALC_TIME);
 }
+
+void BKE_object_eval_cloth(EvaluationContext *UNUSED(eval_ctx), Scene *scene, Object *object)
+{
+	DEBUG_PRINT("%s on %s\n", __func__, object->id.name);
+	BKE_ptcache_object_reset(scene, object, PTCACHE_RESET_DEPSGRAPH);
+}
diff --git a/source/blender/blenkernel/intern/particle.c b/source/blender/blenkernel/intern/particle.c
index 1ea27558545..228ae6afaba 100644
--- a/source/blender/blenkernel/intern/particle.c
+++ b/source/blender/blenkernel/intern/particle.c
@@ -633,8 +633,9 @@ void psys_render_set(Object *ob, ParticleSystem *psys, float viewmat[4][4], floa
 	data->childcachebufs.last = psys->childcachebufs.last;
 	data->totchildcache = psys->totchildcache;
 
-	if (psmd->dm_final)
-		data->dm = CDDM_copy(psmd->dm_final);
+	if (psmd->dm_final) {
+		data->dm = CDDM_copy_with_tessface(psmd->dm_final);
+	}
 	data->totdmvert = psmd->totdmvert;
 	data->totdmedge = psmd->totdmedge;
 	data->totdmface = psmd->totdmface;
diff --git a/source/blender/blenkernel/intern/scene.c b/source/blender/blenkernel/intern/scene.c
index 56bfe5d7ff1..906fa0134a0 100644
--- a/source/blender/blenkernel/intern/scene.c
+++ b/source/blender/blenkernel/intern/scene.c
@@ -1510,8 +1510,6 @@ static void scene_update_object_func(TaskPool * __restrict pool, void *taskdata,
 		if (add_to_stats) {
 			StatisicsEntry *entry;
 
-			BLI_assert(threadid < BLI_pool_get_num_threads(pool));
-
 			entry = MEM_mallocN(sizeof(StatisicsEntry), "update thread statistics");
 			entry->object = object;
 			entry->start_time = start_time;
@@ -1631,10 +1629,11 @@ static bool scene_need_update_objects(Main *bmain)
 
 static void scene_update_objects(EvaluationContext *eval_ctx, Main *bmain, Scene *scene, Scene *scene_parent)
 {
-	TaskScheduler *task_scheduler = BLI_task_scheduler_get();
+	TaskScheduler *task_scheduler;
 	TaskPool *task_pool;
 	ThreadedObjectUpdateState state;
 	bool need_singlethread_pass;
+	bool need_free_scheduler;
 
 	/* Early check for whether we need to invoke all the task-based
 	 * things (spawn new ppol, traverse dependency graph and so on).
@@ -1651,6 +1650,15 @@ static void scene_update_objects(EvaluationContext *eval_ctx, Main *bmain, Scene
 	state.scene = scene;
 	state.scene_parent = scene_parent;
 
+	if (G.debug & G_DEBUG_DEPSGRAPH_NO_THREADS) {
+		task_scheduler = BLI_task_scheduler_create(1);
+		need_free_scheduler = true;
+	}
+	else {
+		task_scheduler = BLI_task_scheduler_get();
+		need_free_scheduler = false;
+	}
+
 	/* Those are only needed when blender is run with --debug argument. */
 	if (G.debug & G_DEBUG_DEPSGRAPH) {
 		const int tot_thread = BLI_task_scheduler_num_threads(task_scheduler);
@@ -1665,9 +1673,6 @@ static void scene_update_objects(EvaluationContext *eval_ctx, Main *bmain, Scene
 #endif
 
 	task_pool = BLI_task_pool_create(task_scheduler, &state);
-	if (G.debug & G_DEBUG_DEPSGRAPH_NO_THREADS) {
-		BLI_pool_set_num_threads(task_pool, 1);
-	}
 
 	DAG_threaded_update_begin(scene, scene_update_object_add_task, task_pool);
 	BLI_task_pool_work_and_wait(task_pool);
@@ -1700,6 +1705,10 @@ static void scene_update_objects(EvaluationContext *eval_ctx, Main *bmain, Scene
 	if (need_singlethread_pass) {
 		scene_update_all_bases(eval_ctx, scene, scene_parent);
 	}
+
+	if (need_free_scheduler) {
+		BLI_task_scheduler_free(task_scheduler);
+	}
 }
 
 static void scene_update_tagged_recursive(EvaluationContext *eval_ctx, Main *bmain, Scene *scene, Scene *scene_parent)
diff --git a/source/blender/blenkernel/intern/sequencer.c b/source/blender/blenkernel/intern/sequencer.c
index 1d2f5aee440..2de7cd49901 100644
--- a/source/blender/blenkernel/intern/sequencer.c
+++ b/source/blender/blenkernel/intern/sequencer.c
@@ -480,55 +480,74 @@ void BKE_sequencer_editing_free(Scene *scene)
 
 static void sequencer_imbuf_assign_spaces(Scene *scene, ImBuf *ibuf)
 {
-	if (ibuf->rect_float) {
+	if (ibuf->rect != NULL) {
+		IMB_colormanagement_assign_rect_colorspace(ibuf, scene->sequencer_colorspace_settings.name);
+	}
+	if (ibuf->rect_float != NULL) {
 		IMB_colormanagement_assign_float_colorspace(ibuf, scene->sequencer_colorspace_settings.name);
 	}
 }
 
 void BKE_sequencer_imbuf_to_sequencer_space(Scene *scene, ImBuf *ibuf, bool make_float)
 {
-	const char *from_colorspace = IMB_colormanagement_role_colorspace_name_get(COLOR_ROLE_SCENE_LINEAR);
+	/* Early output check: if both buffers are NULL we have nothing to convert. */
+	if (ibuf->rect_float == NULL && ibuf->rect == NULL) {
+		return;
+	}
+	/* Get common conversion settings. */
 	const char *to_colorspace = scene->sequencer_colorspace_settings.name;
-	const char *float_colorspace = IMB_colormanagement_get_float_colorspace(ibuf);
-
-	if (!ibuf->rect_float) {
-		if (ibuf->rect) {
-			const char *byte_colorspace = IMB_colormanagement_get_rect_colorspace(ibuf);
-			if (make_float || !STREQ(to_colorspace, byte_colorspace)) {
-				/* If byte space is not in sequencer's working space, we deliver float color space,
-				 * this is to to prevent data loss.
-				 */
-
-				/* when converting byte buffer to float in sequencer we need to make float
-				 * buffer be in sequencer's working space, which is currently only doable
-				 * from linear space.
-				 */
-
-				/*
-				 * OCIO_TODO: would be nice to support direct single transform from byte to sequencer's
-				 */
-
-				IMB_float_from_rect(ibuf);
-			}
-			else {
-				return;
-			}
+	/* Perform actual conversion logic. */
+	if (ibuf->rect_float == NULL) {
+		/* We are not requested to give float buffer and byte buffer is already
+		 * in the required colorspace. Can skip doing anything here.
+		 */
+		const char *from_colorspace = IMB_colormanagement_get_rect_colorspace(ibuf);
+		if (!make_float && STREQ(from_colorspace, to_colorspace)) {
+			return;
+		}
+		if (false) {
+			/* The idea here is to provide as fast playback as possible and
+			 * enforcing float buffer here (a) uses more cache memory (b) might
+			 * make some other effects slower to apply.
+			 *
+			 * However, this might also have negative effect by adding weird
+			 * artifacts which will then not happen in final render.
+			 */
+			IMB_colormanagement_transform_byte_threaded(
+			        (unsigned char *)ibuf->rect, ibuf->x, ibuf->y, ibuf->channels,
+			        from_colorspace, to_colorspace);
 		}
 		else {
-			return;
+			/* We perform conversion to a float buffer so we don't worry about
+			 * precision loss.
+			 */
+			imb_addrectfloatImBuf(ibuf);
+			IMB_colormanagement_transform_from_byte_threaded(
+			        ibuf->rect_float, (unsigned char *)ibuf->rect,
+			        ibuf->x, ibuf->y, ibuf->channels,
+			        from_colorspace, to_colorspace);
+			/* We don't need byte buffer anymore. */
+			imb_freerectImBuf(ibuf);
 		}
 	}
-
-	if (from_colorspace && from_colorspace[0] != '\0') {
-		if (ibuf->rect)
+	else {
+		const char *from_colorspace = IMB_colormanagement_get_float_colorspace(ibuf);
+		/* Unknown input color space, can't perform conversion. */
+		if (from_colorspace == NULL || from_colorspace[0] == '\0') {
+			return;
+		}
+		/* We don't want both byte and float buffers around: they'll either run
+		 * out of sync or conversion of byte buffer will lose precision in there.
+		 */
+		if (ibuf->rect != NULL) {
 			imb_freerectImBuf(ibuf);
-
-		if (!STREQ(float_colorspace, to_colorspace)) {
-			IMB_colormanagement_transform_threaded(ibuf->rect_float, ibuf->x, ibuf->y, ibuf->channels,
-			                                       from_colorspace, to_colorspace, true);
-			sequencer_imbuf_assign_spaces(scene, ibuf);
 		}
+		IMB_colormanagement_transform_threaded(ibuf->rect_float,
+		                                       ibuf->x, ibuf->y, ibuf->channels,
+		                                       from_colorspace, to_colorspace,
+		                                       true);
 	}
+	sequencer_imbuf_assign_spaces(scene, ibuf);
 }
 
 void BKE_sequencer_imbuf_from_sequencer_space(Scene *scene, ImBuf *ibuf)
diff --git a/source/blender/blenkernel/intern/text.c b/source/blender/blenkernel/intern/text.c
index 88575c7d3be..298790a9e9b 100644
--- a/source/blender/blenkernel/intern/text.c
+++ b/source/blender/blenkernel/intern/text.c
@@ -1935,7 +1935,7 @@ void txt_do_undo(Text *text)
 	int op = text->undo_buf[text->undo_pos];
 	int prev_flags;
 	unsigned int linep;
-	unsigned int uchar;
+	unsigned int uni_char;
 	unsigned int curln, selln;
 	unsigned short curc, selc;
 	unsigned short charp;
@@ -1971,14 +1971,14 @@ void txt_do_undo(Text *text)
 		case UNDO_BS_3:
 		case UNDO_BS_4:
 			charp = op - UNDO_BS_1 + 1;
-			uchar = txt_undo_read_unicode(text->undo_buf, &text->undo_pos, charp);
+			uni_char = txt_undo_read_unicode(text->undo_buf, &text->undo_pos, charp);
 			
 			/* get and restore the cursors */
 			txt_undo_read_cur(text->undo_buf, &text->undo_pos, &curln, &curc);
 			txt_move_to(text, curln, curc, 0);
 			txt_move_to(text, curln, curc, 1);
 			
-			txt_add_char(text, uchar);
+			txt_add_char(text, uni_char);
 
 			text->undo_pos--;
 			break;
@@ -1988,14 +1988,14 @@ void txt_do_undo(Text *text)
 		case UNDO_DEL_3:
 		case UNDO_DEL_4:
 			charp = op - UNDO_DEL_1 + 1;
-			uchar = txt_undo_read_unicode(text->undo_buf, &text->undo_pos, charp);
+			uni_char = txt_undo_read_unicode(text->undo_buf, &text->undo_pos, charp);
 
 			/* get and restore the cursors */
 			txt_undo_read_cur(text->undo_buf, &text->undo_pos, &curln, &curc);
 			txt_move_to(text, curln, curc, 0);
 			txt_move_to(text, curln, curc, 1);
 
-			txt_add_char(text, uchar);
+			txt_add_char(text, uni_char);
 
 			txt_move_left(text, 0);
 
@@ -2163,7 +2163,7 @@ void txt_do_redo(Text *text)
 	char *buf;
 	unsigned int linep;
 	unsigned short charp;
-	unsigned int uchar;
+	unsigned int uni_uchar;
 	unsigned int curln, selln;
 	unsigned short curc, selc;
 	
@@ -2190,9 +2190,9 @@ void txt_do_redo(Text *text)
 			txt_move_to(text, curln, curc, 1);
 			
 			charp = op - UNDO_INSERT_1 + 1;
-			uchar = txt_redo_read_unicode(text->undo_buf, &text->undo_pos, charp);
+			uni_uchar = txt_redo_read_unicode(text->undo_buf, &text->undo_pos, charp);
 
-			txt_add_char(text, uchar);
+			txt_add_char(text, uni_uchar);
 			break;
 
 		case UNDO_BS_1:
diff --git a/source/blender/blenkernel/intern/texture.c b/source/blender/blenkernel/intern/texture.c
index 2d3ecad19ad..60990c03b0e 100644
--- a/source/blender/blenkernel/intern/texture.c
+++ b/source/blender/blenkernel/intern/texture.c
@@ -1485,9 +1485,11 @@ bool BKE_texture_dependsOnTime(const struct Tex *texture)
 
 /* ------------------------------------------------------------------------- */
 
-void BKE_texture_get_value(
+void BKE_texture_get_value_ex(
         const Scene *scene, Tex *texture,
-        float *tex_co, TexResult *texres, bool use_color_management)
+        float *tex_co, TexResult *texres,
+        struct ImagePool *pool,
+        bool use_color_management)
 {
 	int result_type;
 	bool do_color_manage = false;
@@ -1497,7 +1499,7 @@ void BKE_texture_get_value(
 	}
 
 	/* no node textures for now */
-	result_type = multitex_ext_safe(texture, tex_co, texres, NULL, do_color_manage, false);
+	result_type = multitex_ext_safe(texture, tex_co, texres, pool, do_color_manage, false);
 
 	/* if the texture gave an RGB value, we assume it didn't give a valid
 	 * intensity, since this is in the context of modifiers don't use perceptual color conversion.
@@ -1510,3 +1512,10 @@ void BKE_texture_get_value(
 		copy_v3_fl(&texres->tr, texres->tin);
 	}
 }
+
+void BKE_texture_get_value(
+        const Scene *scene, Tex *texture,
+        float *tex_co, TexResult *texres, bool use_color_management)
+{
+	BKE_texture_get_value_ex(scene, texture, tex_co, texres, NULL, use_color_management);
+}
diff --git a/source/blender/blenkernel/intern/tracking_stabilize.c b/source/blender/blenkernel/intern/tracking_stabilize.c
index 36b24fbb2dc..3dfaa1ed77d 100644
--- a/source/blender/blenkernel/intern/tracking_stabilize.c
+++ b/source/blender/blenkernel/intern/tracking_stabilize.c
@@ -755,7 +755,8 @@ static void average_marker_positions(StabContext *ctx, int framenr, float r_ref_
 	if (ok) {
 		r_ref_pos[0] /= weight_sum;
 		r_ref_pos[1] /= weight_sum;
-	} else {
+	}
+	else {
 		/* No usable tracking data on any track on this frame.
 		 * Use data from neighbouring frames to extrapolate...
 		 */
@@ -784,7 +785,8 @@ static void average_marker_positions(StabContext *ctx, int framenr, float r_ref_
 			 * Also default to this frame when we're in a gap */
 			average_marker_positions(ctx, next_lower, r_ref_pos);
 
-		} else if (next_higher < MAXFRAME) {
+		}
+		else if (next_higher < MAXFRAME) {
 			average_marker_positions(ctx, next_higher, r_ref_pos);
 		}
 		use_values_from_fcurves(ctx, false);
@@ -1167,7 +1169,8 @@ static void stabilization_calculate_data(StabContext *ctx,
 
 	if (ctx->stab->flag & TRACKING_STABILIZE_SCALE) {
 		*r_scale = expf(scale_step * scaleinf);  /* Averaged in log scale */
-	} else {
+	}
+	else {
 		*r_scale = 1.0f;
 	}
 
@@ -1180,8 +1183,8 @@ static void stabilization_calculate_data(StabContext *ctx,
 	 */
 	get_animated_target_pos(ctx, framenr, target_pos);
 	sub_v2_v2(r_translation, target_pos);
-	*r_angle -= get_animated_target_rot(ctx,framenr);
-	target_scale = get_animated_target_scale(ctx,framenr);
+	*r_angle -= get_animated_target_rot(ctx, framenr);
+	target_scale = get_animated_target_scale(ctx, framenr);
 	if (target_scale != 0.0f) {
 		*r_scale /= target_scale;
 		/* target_scale is an expected/intended reference zoom value */
diff --git a/source/blender/blenkernel/intern/writeffmpeg.c b/source/blender/blenkernel/intern/writeffmpeg.c
index 9994d479ce7..156b74f5c3d 100644
--- a/source/blender/blenkernel/intern/writeffmpeg.c
+++ b/source/blender/blenkernel/intern/writeffmpeg.c
@@ -568,7 +568,8 @@ static AVStream *alloc_video_stream(FFMpegContext *context, RenderData *rd, int
 
 	if (context->ffmpeg_crf >= 0) {
 		ffmpeg_dict_set_int(&opts, "crf", context->ffmpeg_crf);
-	} else {
+	}
+	else {
 		c->bit_rate = context->ffmpeg_video_bitrate * 1000;
 		c->rc_max_rate = rd->ffcodecdata.rc_max_rate * 1000;
 		c->rc_min_rate = rd->ffcodecdata.rc_min_rate * 1000;
@@ -576,8 +577,8 @@ static AVStream *alloc_video_stream(FFMpegContext *context, RenderData *rd, int
 	}
 
 	if (context->ffmpeg_preset) {
-		char const * preset_name;
-		switch(context->ffmpeg_preset) {
+		char const *preset_name;
+		switch (context->ffmpeg_preset) {
 			case FFM_PRESET_ULTRAFAST: preset_name = "ultrafast"; break;
 			case FFM_PRESET_SUPERFAST: preset_name = "superfast"; break;
 			case FFM_PRESET_VERYFAST: preset_name = "veryfast"; break;
diff --git a/source/blender/blenlib/BLI_kdopbvh.h b/source/blender/blenlib/BLI_kdopbvh.h
index 91d39801645..ba565fca522 100644
--- a/source/blender/blenlib/BLI_kdopbvh.h
+++ b/source/blender/blenlib/BLI_kdopbvh.h
@@ -95,10 +95,6 @@ typedef void (*BVHTree_NearestPointCallback)(void *userdata, int index, const fl
 /* callback must update hit in case it finds a nearest successful hit */
 typedef void (*BVHTree_RayCastCallback)(void *userdata, int index, const BVHTreeRay *ray, BVHTreeRayHit *hit);
 
-/* callback must update nearest in case it finds a nearest result */
-typedef void (*BVHTree_NearestToRayCallback)(void *userdata, const float ray_co[3], const float ray_dir[3],
-                                             const float scale[3], int index, BVHTreeNearest *nearest);
-
 /* callback to check if 2 nodes overlap (use thread if intersection results need to be stored) */
 typedef bool (*BVHTree_OverlapCallback)(void *userdata, int index_a, int index_b, int thread);
 
@@ -143,18 +139,6 @@ int BLI_bvhtree_find_nearest(
         BVHTree *tree, const float co[3], BVHTreeNearest *nearest,
         BVHTree_NearestPointCallback callback, void *userdata);
 
-int BLI_bvhtree_find_nearest_to_ray_angle(
-        BVHTree *tree, const float co[3], const float dir[3],
-        const bool ray_is_normalized, const float scale[3],
-        BVHTreeNearest *nearest,
-        BVHTree_NearestToRayCallback callback, void *userdata);
-
-int BLI_bvhtree_find_nearest_to_ray(
-        BVHTree *tree, const float co[3], const float dir[3],
-        const bool ray_is_normalized, const float scale[3],
-        BVHTreeNearest *nearest,
-        BVHTree_NearestToRayCallback callback, void *userdata);
-
 int BLI_bvhtree_ray_cast_ex(
         BVHTree *tree, const float co[3], const float dir[3], float radius, BVHTreeRayHit *hit,
         BVHTree_RayCastCallback callback, void *userdata,
diff --git a/source/blender/blenlib/BLI_math_geom.h b/source/blender/blenlib/BLI_math_geom.h
index 4a85e859c16..f1d9c9571f2 100644
--- a/source/blender/blenlib/BLI_math_geom.h
+++ b/source/blender/blenlib/BLI_math_geom.h
@@ -298,23 +298,6 @@ bool isect_ray_aabb_v3_simple(
         const float bb_min[3], const float bb_max[3],
         float *tmin, float *tmax);
 
-struct NearestRayToAABB_Precalc {
-	float ray_origin[3];
-	float ray_direction[3];
-	float ray_inv_dir[3];
-	float cdot_axis[3];
-	float idiag_sq[3];
-	bool sign[3];
-};
-
-void dist_squared_ray_to_aabb_v3_precalc(
-        struct NearestRayToAABB_Precalc *data,
-        const float ray_origin[3], const float ray_direction[3]);
-float dist_squared_ray_to_aabb_v3(
-        const struct NearestRayToAABB_Precalc *data,
-        const float bb_min[3], const float bb_max[3],
-        bool r_axis_closest[3]);
-
 /* other */
 bool isect_sweeping_sphere_tri_v3(const float p1[3], const float p2[3], const float radius,
                                   const float v0[3], const float v1[3], const float v2[3], float *r_lambda, float ipoint[3]);
diff --git a/source/blender/blenlib/BLI_path_util.h b/source/blender/blenlib/BLI_path_util.h
index baa1f792018..b6a55d34d14 100644
--- a/source/blender/blenlib/BLI_path_util.h
+++ b/source/blender/blenlib/BLI_path_util.h
@@ -39,14 +39,6 @@ extern "C" {
 
 struct ListBase;
 
-#ifdef WIN32
-#define SEP '\\'
-#define ALTSEP '/'
-#else
-#define SEP '/'
-#define ALTSEP '\\'
-#endif
-
 void BLI_setenv(const char *env, const char *val) ATTR_NONNULL(1);
 void BLI_setenv_if_new(const char *env, const char *val) ATTR_NONNULL(1);
 
@@ -60,7 +52,13 @@ void BLI_path_append(char *__restrict dst, const size_t maxlen,
                      const char *__restrict file) ATTR_NONNULL();
 void BLI_join_dirfile(char *__restrict string, const size_t maxlen,
                       const char *__restrict dir, const char *__restrict file) ATTR_NONNULL();
+size_t BLI_path_join(
+        char *__restrict dst, const size_t dst_len,
+        const char *path_first, ...) ATTR_NONNULL(1, 3) ATTR_SENTINEL(0);
 const char *BLI_path_basename(const char *path) ATTR_NONNULL() ATTR_WARN_UNUSED_RESULT;
+bool BLI_path_name_at_index(
+        const char *__restrict path, const int index,
+        int *__restrict r_offset, int *__restrict r_len) ATTR_NONNULL() ATTR_WARN_UNUSED_RESULT;
 
 #if 0
 typedef enum bli_rebase_state {
@@ -83,7 +81,6 @@ bool BLI_path_program_extensions_add_win32(char *name, const size_t maxlen);
 #endif
 bool BLI_path_program_search(char *fullname, const size_t maxlen, const char *name);
 
-void BLI_getlastdir(const char *dir, char *last, const size_t maxlen);
 bool BLI_testextensie(const char *str, const char *ext) ATTR_NONNULL() ATTR_WARN_UNUSED_RESULT;
 bool BLI_testextensie_n(const char *str, ...) ATTR_NONNULL(1) ATTR_SENTINEL(0);
 bool BLI_testextensie_array(const char *str, const char **ext_array) ATTR_NONNULL() ATTR_WARN_UNUSED_RESULT;
@@ -143,6 +140,18 @@ bool BLI_path_suffix(char *string, size_t maxlen, const char *suffix, const char
 #  define FILE_MAX            1024
 #endif
 
+#ifdef WIN32
+#  define SEP        '\\'
+#  define ALTSEP     '/'
+#  define SEP_STR    "\\"
+#  define ALTSEP_STR "/"
+#else
+#  define SEP        '/'
+#  define ALTSEP     '\\'
+#  define SEP_STR    "/"
+#  define ALTSEP_STR "\\"
+#endif
+
 /* Parent and current dir helpers. */
 #define FILENAME_PARENT ".."
 #define FILENAME_CURRENT "."
diff --git a/source/blender/blenlib/BLI_rect.h b/source/blender/blenlib/BLI_rect.h
index 59bf3644912..041679ef876 100644
--- a/source/blender/blenlib/BLI_rect.h
+++ b/source/blender/blenlib/BLI_rect.h
@@ -47,6 +47,8 @@ bool BLI_rcti_is_empty(const struct rcti *rect);
 bool BLI_rctf_is_empty(const struct rctf *rect);
 void BLI_rctf_init(struct rctf *rect, float xmin, float xmax, float ymin, float ymax);
 void BLI_rcti_init(struct rcti *rect, int xmin, int xmax, int ymin, int ymax);
+void BLI_rctf_init_pt_radius(struct rctf *rect, const float xy[2], float size);
+void BLI_rcti_init_pt_radius(struct rcti *rect, const int xy[2], int size);
 void BLI_rcti_init_minmax(struct rcti *rect);
 void BLI_rctf_init_minmax(struct rctf *rect);
 void BLI_rcti_do_minmax_v(struct rcti *rect, const int xy[2]);
diff --git a/source/blender/blenlib/BLI_sys_types.h b/source/blender/blenlib/BLI_sys_types.h
index 7929e1d6551..9d8222c575d 100644
--- a/source/blender/blenlib/BLI_sys_types.h
+++ b/source/blender/blenlib/BLI_sys_types.h
@@ -80,6 +80,11 @@ typedef uint64_t u_int64_t;
 #include <stddef.h>  /* size_t define */
 #include <stdbool.h>
 
+typedef unsigned int uint;
+typedef unsigned short ushort;
+typedef unsigned long ulong;
+typedef unsigned char uchar;
+
 #ifdef __cplusplus 
 }
 #endif
diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index 967e0be6d0a..c3c587275e1 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -81,6 +81,7 @@ typedef void (*TaskFreeFunction)(TaskPool *__restrict pool, void *taskdata, int
 
 TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata);
 TaskPool *BLI_task_pool_create_background(TaskScheduler *scheduler, void *userdata);
+TaskPool *BLI_task_pool_create_suspended(TaskScheduler *scheduler, void *userdata);
 void BLI_task_pool_free(TaskPool *pool);
 
 void BLI_task_pool_push_ex(
@@ -95,14 +96,6 @@ void BLI_task_pool_push_from_thread(TaskPool *pool, TaskRunFunction run,
 void BLI_task_pool_work_and_wait(TaskPool *pool);
 /* cancel all tasks, keep worker threads running */
 void BLI_task_pool_cancel(TaskPool *pool);
-/* stop all worker threads */
-void BLI_task_pool_stop(TaskPool *pool);
-
-/* get number of threads allowed to be used by this pool */
-int BLI_pool_get_num_threads(TaskPool *pool);
-
-/* set number of threads allowed to be used by this pool */
-void BLI_pool_set_num_threads(TaskPool *pool, int num_threads);
 
 /* for worker threads, test if canceled */
 bool BLI_task_pool_canceled(TaskPool *pool);
@@ -113,9 +106,6 @@ void *BLI_task_pool_userdata(TaskPool *pool);
 /* optional mutex to use from run function */
 ThreadMutex *BLI_task_pool_user_mutex(TaskPool *pool);
 
-/* number of tasks done, for stats, don't use this to make decisions */
-size_t BLI_task_pool_tasks_done(TaskPool *pool);
-
 /* Parallel for routines */
 typedef void (*TaskParallelRangeFunc)(void *userdata, const int iter);
 typedef void (*TaskParallelRangeFuncEx)(void *userdata, void *userdata_chunk, const int iter, const int thread_id);
diff --git a/source/blender/blenlib/PIL_time_utildefines.h b/source/blender/blenlib/PIL_time_utildefines.h
index 9157e04a7bf..412cfb3a090 100644
--- a/source/blender/blenlib/PIL_time_utildefines.h
+++ b/source/blender/blenlib/PIL_time_utildefines.h
@@ -80,9 +80,10 @@
 		}                                                                     \
 		const float _delta_##var = TIMEIT_VALUE(var);                         \
 		_sum_##var += _delta_##var;                                           \
+		_num_##var++;                                                         \
 		printf("time end      (" #var "): %.6f" "  " AT "\n", _delta_##var);  \
-		printf("time averaged (" #var "): %.6f" "  " AT "\n",                 \
-		       (_sum_##var / ++_num_##var));                                  \
+		printf("time averaged (" #var "): %.6f (total: %.6f, in %d runs)\n",  \
+		       (_sum_##var / _num_##var), _sum_##var, (int)_num_##var);       \
 		fflush(stdout);                                                       \
 	} (void)0
 
diff --git a/source/blender/blenlib/intern/BLI_kdopbvh.c b/source/blender/blenlib/intern/BLI_kdopbvh.c
index b14007a88cb..19d9711922e 100644
--- a/source/blender/blenlib/intern/BLI_kdopbvh.c
+++ b/source/blender/blenlib/intern/BLI_kdopbvh.c
@@ -159,29 +159,6 @@ typedef struct BVHRayCastData {
 	BVHTreeRayHit hit;
 } BVHRayCastData;
 
-typedef struct BVHNearestRayData {
-	BVHTree *tree;
-	BVHTree_NearestToRayCallback callback;
-	void    *userdata;
-
-	struct {
-		bool sign[3];
-		float origin[3];
-		float direction[3];
-
-		float direction_scaled_square[3];
-		float inv_dir[3];
-
-		float cdot_axis[3];
-	} ray;
-
-	bool pick_smallest[3];
-
-	BVHTreeNearest nearest;
-
-	float scale[3];
-} BVHNearestRayData;
-
 /** \} */
 
 
@@ -1900,453 +1877,6 @@ void BLI_bvhtree_ray_cast_all(
 
 /* -------------------------------------------------------------------- */
 
-/** \name BLI_bvhtree_find_nearest_to_ray functions
- *
- * \{ */
-
-static void dist_squared_ray_to_aabb_scaled_v3_precalc(
-        BVHNearestRayData *data,
-        const float ray_origin[3], const float ray_direction[3],
-        const bool ray_is_normalized, const float scale[3])
-{
-	if (scale) {
-		copy_v3_v3(data->scale, scale);
-	}
-	else {
-		copy_v3_fl(data->scale, 1.0f);
-	}
-	/* un-normalize ray */
-	if (ray_is_normalized && scale &&
-	    (data->scale[0] != 1.0f || data->scale[1] != 1.0f || data->scale[2] != 1.0f))
-	{
-		data->ray.direction[0] = ray_direction[0] * data->scale[0];
-		data->ray.direction[1] = ray_direction[1] * data->scale[1];
-		data->ray.direction[2] = ray_direction[2] * data->scale[2];
-
-		mul_v3_v3fl(data->ray.direction, ray_direction, 1 / len_v3(data->ray.direction));
-	}
-	else {
-		copy_v3_v3(data->ray.direction, ray_direction);
-	}
-
-	float dir_sq[3];
-
-	for (int i = 0; i < 3; i++) {
-		data->ray.origin[i] = ray_origin[i];
-		data->ray.inv_dir[i] = (data->ray.direction[i] != 0.0f) ?
-		                       (1.0f / data->ray.direction[i]) : FLT_MAX;
-		/* It has to be in function of `ray.inv_dir`,
-		 * since the division of 1 by 0.0f, can be -inf or +inf */
-		data->ray.sign[i] = (data->ray.inv_dir[i] < 0.0f);
-
-		data->ray.direction_scaled_square[i] = data->ray.direction[i] * data->scale[i];
-
-		dir_sq[i] = SQUARE(data->ray.direction_scaled_square[i]);
-
-		data->ray.direction_scaled_square[i] *= data->scale[i];
-	}
-
-	/* `diag_sq` Length square of each face diagonal */
-	float diag_sq[3] = {
-		dir_sq[1] + dir_sq[2],
-		dir_sq[0] + dir_sq[2],
-		dir_sq[0] + dir_sq[1],
-	};
-
-	data->ray.cdot_axis[0] = (diag_sq[0] != 0.0f) ? data->ray.direction[0] / diag_sq[0] : FLT_MAX;
-	data->ray.cdot_axis[1] = (diag_sq[1] != 0.0f) ? data->ray.direction[1] / diag_sq[1] : FLT_MAX;
-	data->ray.cdot_axis[2] = (diag_sq[2] != 0.0f) ? data->ray.direction[2] / diag_sq[2] : FLT_MAX;
-}
-
-/**
- * Returns the squared distance from a ray to a bound-box `AABB`.
- * It is based on `fast_ray_nearest_hit` solution to obtain
- * the coordinates of the nearest edge of Bound Box to the ray
- */
-MINLINE float dist_squared_ray_to_aabb_scaled_v3__impl(
-        const BVHNearestRayData *data,
-        const float bv[6], float *r_depth_sq, bool r_axis_closest[3])
-{
-
-	/* `tmin` is a vector that has the smaller distances to each of the
-	 * infinite planes of the `AABB` faces (hit in nearest face X plane,
-	 * nearest face Y plane and nearest face Z plane) */
-	float local_bvmin[3], local_bvmax[3];
-
-	if (data->ray.sign[0]) {
-		local_bvmin[0] = bv[1];
-		local_bvmax[0] = bv[0];
-	}
-	else {
-		local_bvmin[0] = bv[0];
-		local_bvmax[0] = bv[1];
-	}
-
-	if (data->ray.sign[1]) {
-		local_bvmin[1] = bv[3];
-		local_bvmax[1] = bv[2];
-	}
-	else {
-		local_bvmin[1] = bv[2];
-		local_bvmax[1] = bv[3];
-	}
-
-	if (data->ray.sign[2]) {
-		local_bvmin[2] = bv[5];
-		local_bvmax[2] = bv[4];
-	}
-	else {
-		local_bvmin[2] = bv[4];
-		local_bvmax[2] = bv[5];
-	}
-
-	sub_v3_v3(local_bvmin, data->ray.origin);
-	sub_v3_v3(local_bvmax, data->ray.origin);
-
-	const float tmin[3] = {
-		local_bvmin[0] * data->ray.inv_dir[0],
-		local_bvmin[1] * data->ray.inv_dir[1],
-		local_bvmin[2] * data->ray.inv_dir[2],
-	};
-
-	/* `tmax` is a vector that has the longer distances to each of the
-	 * infinite planes of the `AABB` faces (hit in farthest face X plane,
-	 * farthest face Y plane and farthest face Z plane) */
-	const float tmax[3] = {
-		local_bvmax[0] * data->ray.inv_dir[0],
-		local_bvmax[1] * data->ray.inv_dir[1],
-		local_bvmax[2] * data->ray.inv_dir[2],
-	};
-	/* `v1` and `v3` is be the coordinates of the nearest `AABB` edge to the ray*/
-	float v1[3], v2[3];
-	/* `rtmin` is the highest value of the smaller distances. == max_axis_v3(tmin)
-	 * `rtmax` is the lowest value of longer distances. == min_axis_v3(tmax)*/
-	float rtmin, rtmax, mul;
-	/* `main_axis` is the axis equivalent to edge close to the ray */
-	int main_axis;
-
-	r_axis_closest[0] = false;
-	r_axis_closest[1] = false;
-	r_axis_closest[2] = false;
-
-	/* *** min_axis_v3(tmax) *** */
-	if ((tmax[0] <= tmax[1]) && (tmax[0] <= tmax[2])) {
-		// printf("# Hit in X %s\n", data->sign[0] ? "min", "max");
-		rtmax = tmax[0];
-		v1[0] = v2[0] = local_bvmax[0];
-		mul = local_bvmax[0] * data->ray.direction_scaled_square[0];
-		main_axis = 3;
-		r_axis_closest[0] = data->ray.sign[0];
-	}
-	else if ((tmax[1] <= tmax[0]) && (tmax[1] <= tmax[2])) {
-		// printf("# Hit in Y %s\n", data->sign[1] ? "min", "max");
-		rtmax = tmax[1];
-		v1[1] = v2[1] = local_bvmax[1];
-		mul = local_bvmax[1] * data->ray.direction_scaled_square[1];
-		main_axis = 2;
-		r_axis_closest[1] = data->ray.sign[1];
-	}
-	else {
-		// printf("# Hit in Z %s\n", data->sign[2] ? "min", "max");
-		rtmax = tmax[2];
-		v1[2] = v2[2] = local_bvmax[2];
-		mul = local_bvmax[2] * data->ray.direction_scaled_square[2];
-		main_axis = 1;
-		r_axis_closest[2] = data->ray.sign[2];
-	}
-
-	/* *** max_axis_v3(tmin) *** */
-	if ((tmin[0] >= tmin[1]) && (tmin[0] >= tmin[2])) {
-		// printf("# To X %s\n", data->sign[0] ? "max", "min");
-		rtmin = tmin[0];
-		v1[0] = v2[0] = local_bvmin[0];
-		mul += local_bvmin[0] * data->ray.direction_scaled_square[0];
-		main_axis -= 3;
-		r_axis_closest[0] = !data->ray.sign[0];
-	}
-	else if ((tmin[1] >= tmin[0]) && (tmin[1] >= tmin[2])) {
-		// printf("# To Y %s\n", data->sign[1] ? "max", "min");
-		rtmin = tmin[1];
-		v1[1] = v2[1] = local_bvmin[1];
-		mul += local_bvmin[1] * data->ray.direction_scaled_square[1];
-		main_axis -= 1;
-		r_axis_closest[1] = !data->ray.sign[1];
-	}
-	else {
-		// printf("# To Z %s\n", data->sign[2] ? "max", "min");
-		rtmin = tmin[2];
-		v1[2] = v2[2] = local_bvmin[2];
-		mul += local_bvmin[2] * data->ray.direction_scaled_square[2];
-		main_axis -= 2;
-		r_axis_closest[2] = !data->ray.sign[2];
-	}
-	/* *** end min/max axis *** */
-
-	if (main_axis < 0)
-		main_axis += 3;
-
-	/* if rtmin < rtmax, ray intersect `AABB` */
-	if (rtmin <= rtmax) {
-#ifdef IGNORE_BEHIND_RAY
-		/* `if rtmax < depth_min`, the whole `AABB` is behind us */
-		if (rtmax < min_depth) {
-			return fallback;
-		}
-#endif
-		const float proj = rtmin * data->ray.direction[main_axis];
-
-		if (data->ray.sign[main_axis])
-			r_axis_closest[main_axis] = (proj - local_bvmax[main_axis]) < (local_bvmin[main_axis] - proj);
-		else
-			r_axis_closest[main_axis] = (proj - local_bvmin[main_axis]) < (local_bvmax[main_axis] - proj);
-
-		//if (r_depth_sq)
-		//	*r_depth_sq = SQUARE(rtmin);
-
-		return 0.0f;
-	}
-#ifdef IGNORE_BEHIND_RAY
-	/* `if rtmin < depth_min`, the whole `AABB` is behing us */
-	else if (rtmin < min_depth) {
-		return fallback;
-	}
-#endif
-
-	if (data->ray.sign[main_axis]) {
-		v1[main_axis] = local_bvmax[main_axis];
-		v2[main_axis] = local_bvmin[main_axis];
-	}
-	else {
-		v1[main_axis] = local_bvmin[main_axis];
-		v2[main_axis] = local_bvmax[main_axis];
-	}
-	{
-		/* `proj` equals to nearest point on the ray closest to the edge `v1 v2` of the `AABB`. */
-		const float proj = mul * data->ray.cdot_axis[main_axis];
-		float depth_sq, r_point[3];
-		if (v1[main_axis] > proj) { /* the nearest point to the ray is the point v1 */
-			r_axis_closest[main_axis] = true;
-			/* `depth` is equivalent the distance of the the projection of v1 on the ray */
-			depth_sq = mul + data->ray.direction_scaled_square[main_axis] * v1[main_axis];
-
-			copy_v3_v3(r_point, v1);
-		}
-		else if (v2[main_axis] < proj) { /* the nearest point of the ray is the point v2 */
-			r_axis_closest[main_axis] = false;
-
-			depth_sq = mul + data->ray.direction_scaled_square[main_axis] * v2[main_axis];
-
-			copy_v3_v3(r_point, v2);
-		}
-		else {  /* the nearest point of the ray is on the edge of the `AABB`. */
-			r_axis_closest[main_axis] = (proj - v1[main_axis]) < (v2[main_axis] - proj);
-
-			depth_sq = mul + data->ray.direction_scaled_square[main_axis] * proj;
-#if 0
-			r_point[0] = main_axis == 0 ? proj : v2[0];
-			r_point[1] = main_axis == 1 ? proj : v2[1];
-			r_point[2] = main_axis == 2 ? proj : v2[2];
-#else
-			v2[main_axis] = proj;
-			copy_v3_v3(r_point, v2);
-#endif
-		}
-		depth_sq *= depth_sq;
-
-		if (r_depth_sq)
-			*r_depth_sq = depth_sq;
-
-		/* TODO: scale can be optional */
-		r_point[0] *= data->scale[0];
-		r_point[1] *= data->scale[1];
-		r_point[2] *= data->scale[2];
-
-		return len_squared_v3(r_point) - depth_sq;
-	}
-}
-
-/**
- * <pre>
- *  + r_point
- *  |
- *  | dist
- *  |
- *  +----depth----+orig <-- dir
- *
- * tangent = dist/depth
- * </pre>
- */
-static float calc_tangent_sq(BVHNearestRayData *data, BVHNode *node)
-{
-	float depth_sq;
-	const float dist_sq = dist_squared_ray_to_aabb_scaled_v3__impl(
-	        data, node->bv, &depth_sq, data->pick_smallest);
-
-	return (dist_sq != 0.0f) ? (dist_sq / depth_sq) : 0.0f;
-}
-
-static float calc_dist_sq_to_ray(BVHNearestRayData *data, BVHNode *node)
-{
-	return dist_squared_ray_to_aabb_scaled_v3__impl(
-	        data, node->bv, NULL,
-	        data->pick_smallest);
-}
-
-static void dfs_find_lowest_tangent_dfs(BVHNearestRayData *data, BVHNode *node)
-{
-	if (node->totnode == 0) {
-		if (data->callback) {
-			data->callback(data->userdata, data->ray.origin, data->ray.direction,
-			               data->scale, node->index, &data->nearest);
-		}
-		else {
-			data->nearest.index = node->index;
-			data->nearest.dist_sq = calc_tangent_sq(data, node);
-			/* TODO: return a value to the data->nearest.co
-			 * not urgent however since users currently define own callbacks */
-		}
-	}
-	else {
-		int i;
-		/* First pick the closest node to dive on */
-		if (data->pick_smallest[node->main_axis]) {
-			for (i = 0; i != node->totnode; i++) {
-				if (calc_tangent_sq(data, node->children[i]) < data->nearest.dist_sq) {
-					dfs_find_lowest_tangent_dfs(data, node->children[i]);
-				}
-			}
-		}
-		else {
-			for (i = node->totnode - 1; i >= 0; i--) {
-				if (calc_tangent_sq(data, node->children[i]) < data->nearest.dist_sq) {
-					dfs_find_lowest_tangent_dfs(data, node->children[i]);
-				}
-			}
-		}
-	}
-}
-
-static void dfs_find_nearest_to_ray_dfs(BVHNearestRayData *data, BVHNode *node)
-{
-	if (node->totnode == 0) {
-		if (data->callback) {
-			data->callback(data->userdata, data->ray.origin, data->ray.direction,
-			               data->scale, node->index, &data->nearest);
-		}
-		else {
-			data->nearest.index = node->index;
-			data->nearest.dist_sq = calc_dist_sq_to_ray(data, node);
-			/* TODO: return a value to the data->nearest.co
-			 * not urgent however since users currently define own callbacks */
-		}
-	}
-	else {
-		int i;
-		/* First pick the closest node to dive on */
-		if (data->pick_smallest[node->main_axis]) {
-			for (i = 0; i != node->totnode; i++) {
-				if (calc_dist_sq_to_ray(data, node->children[i]) < data->nearest.dist_sq) {
-					dfs_find_nearest_to_ray_dfs(data, node->children[i]);
-				}
-			}
-		}
-		else {
-			for (i = node->totnode - 1; i >= 0; i--) {
-				if (calc_dist_sq_to_ray(data, node->children[i]) < data->nearest.dist_sq) {
-					dfs_find_nearest_to_ray_dfs(data, node->children[i]);
-				}
-			}
-		}
-	}
-}
-
-/**
- * Returns the point whose tangent defined by the angle between the point and ray is the lowest
- * nearest.dist_sq returns the angle's tangent
- */
-int BLI_bvhtree_find_nearest_to_ray_angle(
-        BVHTree *tree, const float co[3], const float dir[3],
-        const bool ray_is_normalized, const float scale[3],
-        BVHTreeNearest *nearest,
-        BVHTree_NearestToRayCallback callback, void *userdata)
-{
-	BVHNearestRayData data;
-	BVHNode *root = tree->nodes[tree->totleaf];
-
-	data.tree = tree;
-
-	data.callback = callback;
-	data.userdata = userdata;
-
-	dist_squared_ray_to_aabb_scaled_v3_precalc(&data, co, dir, ray_is_normalized, scale);
-
-	if (nearest) {
-		memcpy(&data.nearest, nearest, sizeof(*nearest));
-	}
-	else {
-		data.nearest.index = -1;
-		data.nearest.dist_sq = FLT_MAX;
-	}
-
-	/* dfs search */
-	if (root) {
-		if (calc_tangent_sq(&data, root) < data.nearest.dist_sq)
-			dfs_find_lowest_tangent_dfs(&data, root);
-	}
-
-	/* copy back results */
-	if (nearest) {
-		memcpy(nearest, &data.nearest, sizeof(*nearest));
-	}
-
-	return data.nearest.index;
-}
-
-/* return the nearest point to ray */
-int BLI_bvhtree_find_nearest_to_ray(
-        BVHTree *tree, const float co[3], const float dir[3],
-        const bool ray_is_normalized, const float scale[3],
-        BVHTreeNearest *nearest,
-        BVHTree_NearestToRayCallback callback, void *userdata)
-{
-	BVHNearestRayData data;
-	BVHNode *root = tree->nodes[tree->totleaf];
-
-	data.tree = tree;
-
-	data.callback = callback;
-	data.userdata = userdata;
-
-	dist_squared_ray_to_aabb_scaled_v3_precalc(&data, co, dir, ray_is_normalized, scale);
-
-	if (nearest) {
-		memcpy(&data.nearest, nearest, sizeof(*nearest));
-	}
-	else {
-		data.nearest.index = -1;
-		data.nearest.dist_sq = FLT_MAX;
-	}
-
-	/* dfs search */
-	if (root) {
-		if (calc_dist_sq_to_ray(&data, root) < data.nearest.dist_sq) {
-			dfs_find_nearest_to_ray_dfs(&data, root);
-		}
-	}
-
-	/* copy back results */
-	if (nearest) {
-		memcpy(nearest, &data.nearest, sizeof(*nearest));
-	}
-
-	return data.nearest.index;
-}
-
-/** \} */
-
-
-/* -------------------------------------------------------------------- */
-
 /** \name BLI_bvhtree_range_query
  *
  * Allocs and fills an array with the indexs of node that are on the given spherical range (center, radius).
diff --git a/source/blender/blenlib/intern/array_store.c b/source/blender/blenlib/intern/array_store.c
index 21ddddad32e..295b39c1a2f 100644
--- a/source/blender/blenlib/intern/array_store.c
+++ b/source/blender/blenlib/intern/array_store.c
@@ -217,15 +217,12 @@
 /** \name Internal Structs
  * \{ */
 
-typedef unsigned int  uint;
-typedef unsigned char ubyte;
-
 typedef uint64_t hash_key;
 
 
 typedef struct BArrayInfo {
 	size_t chunk_stride;
-	uint chunk_count;
+	// uint chunk_count;  /* UNUSED (other values are derived from this) */
 
 	/* pre-calculated */
 	size_t chunk_byte_size;
@@ -291,7 +288,7 @@ typedef struct BChunkList {
 
 /* a chunk of an array */
 typedef struct BChunk {
-	const ubyte *data;
+	const uchar *data;
 	size_t       data_len;
 	/** number of #BChunkList using this. */
 	int          users;
@@ -332,7 +329,7 @@ static size_t bchunk_list_size(const BChunkList *chunk_list);
  * \{ */
 
 static BChunk *bchunk_new(
-        BArrayMemory *bs_mem, const ubyte *data, const size_t data_len)
+        BArrayMemory *bs_mem, const uchar *data, const size_t data_len)
 {
 	BChunk *chunk = BLI_mempool_alloc(bs_mem->chunk);
 	chunk->data     = data;
@@ -345,9 +342,9 @@ static BChunk *bchunk_new(
 }
 
 static BChunk *bchunk_new_copydata(
-        BArrayMemory *bs_mem, const ubyte *data, const size_t data_len)
+        BArrayMemory *bs_mem, const uchar *data, const size_t data_len)
 {
-	ubyte *data_copy = MEM_mallocN(data_len, __func__);
+	uchar *data_copy = MEM_mallocN(data_len, __func__);
 	memcpy(data_copy, data, data_len);
 	return bchunk_new(bs_mem, data_copy, data_len);
 }
@@ -367,7 +364,7 @@ static void bchunk_decref(
 
 static bool bchunk_data_compare(
         const BChunk *chunk,
-        const ubyte *data_base, const size_t data_base_len,
+        const uchar *data_base, const size_t data_base_len,
         const size_t offset)
 {
 	if (offset + (size_t)chunk->data_len <= data_base_len) {
@@ -426,14 +423,14 @@ static void bchunk_list_decref(
 
 #ifdef USE_VALIDATE_LIST_DATA_PARTIAL
 static size_t bchunk_list_data_check(
-        const BChunkList *chunk_list, const ubyte *data)
+        const BChunkList *chunk_list, const uchar *data)
 {
-	size_t total_size = 0;
+	size_t offset = 0;
 	for (BChunkRef *cref = chunk_list->chunk_refs.first; cref; cref = cref->next) {
-		if (memcmp(&data[total_size], cref->link->data, cref->link->data_len) != 0) {
+		if (memcmp(&data[offset], cref->link->data, cref->link->data_len) != 0) {
 			return false;
 		}
-		total_size += cref->link->data_len;
+		offset += cref->link->data_len;
 	}
 	return true;
 }
@@ -466,7 +463,7 @@ static void bchunk_list_ensure_min_size_last(
 				chunk_list->chunk_refs.last = cref->prev;
 				chunk_list->chunk_refs_len -= 1;
 
-				ubyte *data_merge   = MEM_mallocN(data_merge_len, __func__);
+				uchar *data_merge   = MEM_mallocN(data_merge_len, __func__);
 				memcpy(data_merge,                        chunk_prev->data, chunk_prev->data_len);
 				memcpy(&data_merge[chunk_prev->data_len], chunk_curr->data, chunk_curr->data_len);
 
@@ -487,8 +484,8 @@ static void bchunk_list_ensure_min_size_last(
 				/* merge and split */
 				const size_t data_prev_len = split;
 				const size_t data_curr_len = data_merge_len - split;
-				ubyte *data_prev = MEM_mallocN(data_prev_len, __func__);
-				ubyte *data_curr = MEM_mallocN(data_curr_len, __func__);
+				uchar *data_prev = MEM_mallocN(data_prev_len, __func__);
+				uchar *data_curr = MEM_mallocN(data_curr_len, __func__);
 
 				if (data_prev_len <= chunk_prev->data_len) {
 					const size_t data_curr_shrink_len = chunk_prev->data_len - data_prev_len;
@@ -597,11 +594,10 @@ static void bchunk_list_append_only(
 static void bchunk_list_append_data(
         const BArrayInfo *info, BArrayMemory *bs_mem,
         BChunkList *chunk_list,
-        const ubyte *data, const size_t data_len)
+        const uchar *data, const size_t data_len)
 {
 	BLI_assert(data_len != 0);
 
-	// printf("data_len: %d\n", data_len);
 #ifdef USE_MERGE_CHUNKS
 	BLI_assert(data_len <= info->chunk_byte_size_max);
 
@@ -613,13 +609,13 @@ static void bchunk_list_append_data(
 			const size_t data_merge_len = chunk_prev->data_len + data_len;
 			/* realloc for single user */
 			if (cref->link->users == 1) {
-				ubyte *data_merge = MEM_reallocN((void *)cref->link->data, data_merge_len);
+				uchar *data_merge = MEM_reallocN((void *)cref->link->data, data_merge_len);
 				memcpy(&data_merge[chunk_prev->data_len], data, data_len);
 				cref->link->data     = data_merge;
 				cref->link->data_len = data_merge_len;
 			}
 			else {
-				ubyte *data_merge = MEM_mallocN(data_merge_len, __func__);
+				uchar *data_merge = MEM_mallocN(data_merge_len, __func__);
 				memcpy(data_merge, chunk_prev->data, chunk_prev->data_len);
 				memcpy(&data_merge[chunk_prev->data_len], data, data_len);
 				cref->link = bchunk_new(bs_mem, data_merge, data_merge_len);
@@ -639,7 +635,7 @@ static void bchunk_list_append_data(
 	/* don't run this, instead preemptively avoid creating a chunk only to merge it (above). */
 #if 0
 #ifdef USE_MERGE_CHUNKS
-	bchunk_list_ensure_min_size_last(info, bs_mem, chunk_list, chunk_size_min);
+	bchunk_list_ensure_min_size_last(info, bs_mem, chunk_list);
 #endif
 #endif
 }
@@ -654,7 +650,7 @@ static void bchunk_list_append_data(
 static void bchunk_list_append_data_n(
         const BArrayInfo *info, BArrayMemory *bs_mem,
         BChunkList *chunk_list,
-        const ubyte *data, size_t data_len)
+        const uchar *data, size_t data_len)
 {
 	size_t data_trim_len, data_last_chunk_len;
 	bchunk_list_calc_trim_len(info, data_len, &data_trim_len, &data_last_chunk_len);
@@ -714,7 +710,7 @@ static void bchunk_list_append(
 static void bchunk_list_fill_from_array(
         const BArrayInfo *info, BArrayMemory *bs_mem,
         BChunkList *chunk_list,
-        const ubyte *data,
+        const uchar *data,
         const size_t data_len)
 {
 	BLI_assert(BLI_listbase_is_empty(&chunk_list->chunk_refs));
@@ -765,13 +761,13 @@ static void bchunk_list_fill_from_array(
 
 #define HASH_INIT (5381)
 
-BLI_INLINE uint hash_data_single(const ubyte p)
+BLI_INLINE uint hash_data_single(const uchar p)
 {
 	return (HASH_INIT << 5) + HASH_INIT + (unsigned int)p;
 }
 
 /* hash bytes, from BLI_ghashutil_strhash_n */
-static uint hash_data(const ubyte *key, size_t n)
+static uint hash_data(const uchar *key, size_t n)
 {
 	const signed char *p;
 	unsigned int h = HASH_INIT;
@@ -788,7 +784,7 @@ static uint hash_data(const ubyte *key, size_t n)
 
 #ifdef USE_HASH_TABLE_ACCUMULATE
 static void hash_array_from_data(
-        const BArrayInfo *info, const ubyte *data_slice, const size_t data_slice_len,
+        const BArrayInfo *info, const uchar *data_slice, const size_t data_slice_len,
         hash_key *hash_array)
 {
 	if (info->chunk_stride != 1) {
@@ -877,7 +873,7 @@ static void hash_accum_single(hash_key *hash_array, const size_t hash_array_len,
 
 static hash_key key_from_chunk_ref(
         const BArrayInfo *info, const BChunkRef *cref,
-        /* avoid reallicating each time */
+        /* avoid reallocating each time */
         hash_key *hash_store, const size_t hash_store_len)
 {
 	/* in C, will fill in a reusable array */
@@ -899,7 +895,7 @@ static hash_key key_from_chunk_ref(
 			key = hash_store[0];
 
 			/* cache the key */
-			if (key == HASH_TABLE_KEY_UNSET) {
+			if (UNLIKELY(key == HASH_TABLE_KEY_UNSET)) {
 				key = HASH_TABLE_KEY_FALLBACK;
 			}
 			chunk->key = key;
@@ -929,12 +925,12 @@ static hash_key key_from_chunk_ref(
 
 static const BChunkRef *table_lookup(
         const BArrayInfo *info, BTableRef **table, const size_t table_len, const size_t i_table_start,
-        const ubyte *data, const size_t data_len, const size_t offset, const hash_key *table_hash_array)
+        const uchar *data, const size_t data_len, const size_t offset, const hash_key *table_hash_array)
 {
 	size_t size_left = data_len - offset;
 	hash_key key = table_hash_array[((offset - i_table_start) / info->chunk_stride)];
 	size_t key_index = (size_t)(key % (hash_key)table_len);
-	for (BTableRef *tref = table[key_index]; tref; tref = tref->next) {
+	for (const BTableRef *tref = table[key_index]; tref; tref = tref->next) {
 		const BChunkRef *cref = tref->cref;
 #ifdef USE_HASH_TABLE_KEY_CACHE
 		if (cref->link->key == key)
@@ -985,7 +981,7 @@ static hash_key key_from_chunk_ref(const BArrayInfo *info, const BChunkRef *cref
 
 static const BChunkRef *table_lookup(
         const BArrayInfo *info, BTableRef **table, const size_t table_len, const uint UNUSED(i_table_start),
-        const ubyte *data, const size_t data_len, const size_t offset, const hash_key *UNUSED(table_hash_array))
+        const uchar *data, const size_t data_len, const size_t offset, const hash_key *UNUSED(table_hash_array))
 {
 	const size_t data_hash_len = BCHUNK_HASH_LEN * info->chunk_stride;  /* TODO, cache */
 
@@ -1025,7 +1021,7 @@ static const BChunkRef *table_lookup(
  */
 static BChunkList *bchunk_list_from_data_merge(
         const BArrayInfo *info, BArrayMemory *bs_mem,
-        const ubyte *data, const size_t data_len_original,
+        const uchar *data, const size_t data_len_original,
         const BChunkList *chunk_list_reference)
 {
 	ASSERT_CHUNKLIST_SIZE(chunk_list_reference, chunk_list_reference->total_size);
@@ -1042,10 +1038,8 @@ static BChunkList *bchunk_list_from_data_merge(
 	size_t i_prev = 0;
 
 #ifdef USE_FASTPATH_CHUNKS_FIRST
-	bool full_match = false;
-
 	{
-		full_match = true;
+		bool full_match = true;
 
 		const BChunkRef *cref = chunk_list_reference->chunk_refs.first;
 		while (i_prev < data_len_original) {
@@ -1433,7 +1427,7 @@ BArrayStore *BLI_array_store_create(
 	BArrayStore *bs = MEM_callocN(sizeof(BArrayStore), __func__);
 
 	bs->info.chunk_stride = stride;
-	bs->info.chunk_count = chunk_count;
+	// bs->info.chunk_count = chunk_count;
 
 	bs->info.chunk_byte_size = chunk_count * stride;
 #ifdef USE_MERGE_CHUNKS
@@ -1579,7 +1573,7 @@ BArrayState *BLI_array_store_state_add(
 	if (state_reference) {
 		chunk_list = bchunk_list_from_data_merge(
 		        &bs->info, &bs->memory,
-		        (const ubyte *)data, data_len,
+		        (const uchar *)data, data_len,
 		        /* re-use reference chunks */
 		        state_reference->chunk_list);
 	}
@@ -1588,7 +1582,7 @@ BArrayState *BLI_array_store_state_add(
 		bchunk_list_fill_from_array(
 		        &bs->info, &bs->memory,
 		        chunk_list,
-		        (const ubyte *)data, data_len);
+		        (const uchar *)data, data_len);
 	}
 
 	chunk_list->users += 1;
@@ -1655,7 +1649,7 @@ void BLI_array_store_state_data_get(
 	BLI_assert(data_test_len == state->chunk_list->total_size);
 #endif
 
-	ubyte *data_step = (ubyte *)data;
+	uchar *data_step = (uchar *)data;
 	for (BChunkRef *cref = state->chunk_list->chunk_refs.first; cref; cref = cref->next) {
 		BLI_assert(cref->link->users > 0);
 		memcpy(data_step, cref->link->data, cref->link->data_len);
diff --git a/source/blender/blenlib/intern/math_geom.c b/source/blender/blenlib/intern/math_geom.c
index aeb6a550cd9..58699a0593b 100644
--- a/source/blender/blenlib/intern/math_geom.c
+++ b/source/blender/blenlib/intern/math_geom.c
@@ -2337,224 +2337,6 @@ bool isect_ray_aabb_v3_simple(
 	}
 }
 
-void dist_squared_ray_to_aabb_v3_precalc(
-        struct NearestRayToAABB_Precalc *data,
-        const float ray_origin[3], const float ray_direction[3])
-{
-	float dir_sq[3];
-
-	for (int i = 0; i < 3; i++) {
-		data->ray_origin[i] = ray_origin[i];
-		data->ray_direction[i] = ray_direction[i];
-		data->ray_inv_dir[i] = (data->ray_direction[i] != 0.0f) ? (1.0f / data->ray_direction[i]) : FLT_MAX;
-		/* It has to be a function of `ray_inv_dir`,
-		 * since the division of 1 by 0.0f, can be -inf or +inf */
-		data->sign[i] = (data->ray_inv_dir[i] < 0.0f);
-
-		dir_sq[i] = SQUARE(data->ray_direction[i]);
-	}
-
-	/* `diag_sq` Length square of each face diagonal */
-	float diag_sq[3] = {
-		dir_sq[1] + dir_sq[2],
-		dir_sq[0] + dir_sq[2],
-		dir_sq[0] + dir_sq[1],
-	};
-	data->idiag_sq[0] = (diag_sq[0] > FLT_EPSILON) ? (1.0f / diag_sq[0]) : FLT_MAX;
-	data->idiag_sq[1] = (diag_sq[1] > FLT_EPSILON) ? (1.0f / diag_sq[1]) : FLT_MAX;
-	data->idiag_sq[2] = (diag_sq[2] > FLT_EPSILON) ? (1.0f / diag_sq[2]) : FLT_MAX;
-
-	data->cdot_axis[0] = data->ray_direction[0] * data->idiag_sq[0];
-	data->cdot_axis[1] = data->ray_direction[1] * data->idiag_sq[1];
-	data->cdot_axis[2] = data->ray_direction[2] * data->idiag_sq[2];
-}
-
-/**
- * Returns the squared distance from a ray to a bound-box `AABB`.
- * It is based on `fast_ray_nearest_hit` solution to obtain
- * the coordinates of the nearest edge of Bound Box to the ray
- */
-float dist_squared_ray_to_aabb_v3(
-        const struct NearestRayToAABB_Precalc *data,
-        const float bb_min[3], const float bb_max[3],
-        bool r_axis_closest[3])
-{
-	/* `tmin` is a vector that has the smaller distances to each of the
-	 * infinite planes of the `AABB` faces (hit in nearest face X plane,
-	 * nearest face Y plane and nearest face Z plane) */
-	float local_bvmin[3], local_bvmax[3];
-
-	if (data->sign[0] == 0) {
-		local_bvmin[0] = bb_min[0] - data->ray_origin[0];
-		local_bvmax[0] = bb_max[0] - data->ray_origin[0];
-	}
-	else {
-		local_bvmin[0] = bb_max[0] - data->ray_origin[0];
-		local_bvmax[0] = bb_min[0] - data->ray_origin[0];
-	}
-
-	if (data->sign[1] == 0) {
-		local_bvmin[1] = bb_min[1] - data->ray_origin[1];
-		local_bvmax[1] = bb_max[1] - data->ray_origin[1];
-	}
-	else {
-		local_bvmin[1] = bb_max[1] - data->ray_origin[1];
-		local_bvmax[1] = bb_min[1] - data->ray_origin[1];
-	}
-
-	if (data->sign[2] == 0) {
-		local_bvmin[2] = bb_min[2] - data->ray_origin[2];
-		local_bvmax[2] = bb_max[2] - data->ray_origin[2];
-	}
-	else {
-		local_bvmin[2] = bb_max[2] - data->ray_origin[2];
-		local_bvmax[2] = bb_min[2] - data->ray_origin[2];
-	}
-
-	const float tmin[3] = {
-		local_bvmin[0] * data->ray_inv_dir[0],
-		local_bvmin[1] * data->ray_inv_dir[1],
-		local_bvmin[2] * data->ray_inv_dir[2],
-	};
-
-	/* `tmax` is a vector that has the longer distances to each of the
-	 * infinite planes of the `AABB` faces (hit in farthest face X plane,
-	 * farthest face Y plane and farthest face Z plane) */
-	const float tmax[3] = {
-		local_bvmax[0] * data->ray_inv_dir[0],
-		local_bvmax[1] * data->ray_inv_dir[1],
-		local_bvmax[2] * data->ray_inv_dir[2],
-	};
-	/* `v1` and `v3` is be the coordinates of the nearest `AABB` edge to the ray*/
-	float v1[3], v2[3];
-	/* `rtmin` is the highest value of the smaller distances. == max_axis_v3(tmin)
-	 * `rtmax` is the lowest value of longer distances. == min_axis_v3(tmax)*/
-	float rtmin, rtmax, mul, rdist;
-	/* `main_axis` is the axis equivalent to edge close to the ray */
-	int main_axis;
-
-	r_axis_closest[0] = false;
-	r_axis_closest[1] = false;
-	r_axis_closest[2] = false;
-
-	/* *** min_axis_v3(tmax) *** */
-	if ((tmax[0] <= tmax[1]) && (tmax[0] <= tmax[2])) {
-		// printf("# Hit in X %s\n", data->sign[0] ? "min", "max");
-		rtmax = tmax[0];
-		v1[0] = v2[0] = local_bvmax[0];
-		mul = local_bvmax[0] * data->ray_direction[0];
-		main_axis = 3;
-		r_axis_closest[0] = data->sign[0];
-	}
-	else if ((tmax[1] <= tmax[0]) && (tmax[1] <= tmax[2])) {
-		// printf("# Hit in Y %s\n", data->sign[1] ? "min", "max");
-		rtmax = tmax[1];
-		v1[1] = v2[1] = local_bvmax[1];
-		mul = local_bvmax[1] * data->ray_direction[1];
-		main_axis = 2;
-		r_axis_closest[1] = data->sign[1];
-	}
-	else {
-		// printf("# Hit in Z %s\n", data->sign[2] ? "min", "max");
-		rtmax = tmax[2];
-		v1[2] = v2[2] = local_bvmax[2];
-		mul = local_bvmax[2] * data->ray_direction[2];
-		main_axis = 1;
-		r_axis_closest[2] = data->sign[2];
-	}
-
-	/* *** max_axis_v3(tmin) *** */
-	if ((tmin[0] >= tmin[1]) && (tmin[0] >= tmin[2])) {
-		// printf("# To X %s\n", data->sign[0] ? "max", "min");
-		rtmin = tmin[0];
-		v1[0] = v2[0] = local_bvmin[0];
-		mul += local_bvmin[0] * data->ray_direction[0];
-		main_axis -= 3;
-		r_axis_closest[0] = !data->sign[0];
-	}
-	else if ((tmin[1] >= tmin[0]) && (tmin[1] >= tmin[2])) {
-		// printf("# To Y %s\n", data->sign[1] ? "max", "min");
-		rtmin = tmin[1];
-		v1[1] = v2[1] = local_bvmin[1];
-		mul += local_bvmin[1] * data->ray_direction[1];
-		main_axis -= 1;
-		r_axis_closest[1] = !data->sign[1];
-	}
-	else {
-		// printf("# To Z %s\n", data->sign[2] ? "max", "min");
-		rtmin = tmin[2];
-		v1[2] = v2[2] = local_bvmin[2];
-		mul += local_bvmin[2] * data->ray_direction[2];
-		main_axis -= 2;
-		r_axis_closest[2] = !data->sign[2];
-	}
-	/* *** end min/max axis *** */
-
-
-	/* `if rtmax < 0`, the whole `AABB` is behing us */
-	if ((rtmax < 0.0f) && (rtmin < 0.0f)) {
-		return FLT_MAX;
-	}
-
-	if (main_axis < 0) {
-		main_axis += 3;
-	}
-
-	if (data->sign[main_axis] == 0) {
-		v1[main_axis] = local_bvmin[main_axis];
-		v2[main_axis] = local_bvmax[main_axis];
-	}
-	else {
-		v1[main_axis] = local_bvmax[main_axis];
-		v2[main_axis] = local_bvmin[main_axis];
-	}
-
-	/* if rtmin < rtmax, ray intersect `AABB` */
-	if (rtmin <= rtmax) {
-		const float proj = rtmin * data->ray_direction[main_axis];
-		rdist = 0.0f;
-		r_axis_closest[main_axis] = (proj - v1[main_axis]) < (v2[main_axis] - proj);
-	}
-	else {
-		/* `proj` equals to nearest point on the ray closest to the edge `v1 v2` of the `AABB`. */
-		const float proj = mul * data->cdot_axis[main_axis];
-		float depth;
-		if (v1[main_axis] > proj) {  /* the nearest point to the ray is the point v1 */
-			/* `depth` is equivalent the distance from the origin to the point v1,
-			 * Here's a faster way to calculate the dot product of v1 and ray
-			 * (depth = dot_v3v3(v1, data->ray.direction))*/
-			depth = mul + data->ray_direction[main_axis] * v1[main_axis];
-			rdist = len_squared_v3(v1) - SQUARE(depth);
-			r_axis_closest[main_axis] = true;
-		}
-		else if (v2[main_axis] < proj) {  /* the nearest point of the ray is the point v2 */
-			depth = mul + data->ray_direction[main_axis] * v2[main_axis];
-			rdist = len_squared_v3(v2) - SQUARE(depth);
-			r_axis_closest[main_axis] = false;
-		}
-		else {  /* the nearest point of the ray is on the edge of the `AABB`. */
-			float v[2];
-			mul *= data->idiag_sq[main_axis];
-			if (main_axis == 0) {
-				v[0] = (mul * data->ray_direction[1]) - v1[1];
-				v[1] = (mul * data->ray_direction[2]) - v1[2];
-			}
-			else if (main_axis == 1) {
-				v[0] = (mul * data->ray_direction[0]) - v1[0];
-				v[1] = (mul * data->ray_direction[2]) - v1[2];
-			}
-			else {
-				v[0] = (mul * data->ray_direction[0]) - v1[0];
-				v[1] = (mul * data->ray_direction[1]) - v1[1];
-			}
-			rdist = len_squared_v2(v);
-			r_axis_closest[main_axis] = (proj - v1[main_axis]) < (v2[main_axis] - proj);
-		}
-	}
-
-	return rdist;
-}
-
 /* find closest point to p on line through (l1, l2) and return lambda,
  * where (0 <= lambda <= 1) when cp is in the line segment (l1, l2)
  */
@@ -3069,6 +2851,9 @@ bool barycentric_coords_v2(const float v1[2], const float v2[2], const float v3[
 
 /**
  * \note: using #cross_tri_v2 means locations outside the triangle are correctly weighted
+ *
+ * \note This is *exactly* the same calculation as #resolve_tri_uv_v2,
+ * although it has double precision and is used for texture baking, so keep both.
  */
 void barycentric_weights_v2(const float v1[2], const float v2[2], const float v3[2], const float co[2], float w[3])
 {
@@ -3108,9 +2893,11 @@ void barycentric_weights_v2_persp(const float v1[4], const float v2[4], const fl
 	}
 }
 
-/* same as #barycentric_weights_v2 but works with a quad,
+/**
+ * same as #barycentric_weights_v2 but works with a quad,
  * note: untested for values outside the quad's bounds
- * this is #interp_weights_poly_v2 expanded for quads only */
+ * this is #interp_weights_poly_v2 expanded for quads only
+ */
 void barycentric_weights_v2_quad(const float v1[2], const float v2[2], const float v3[2], const float v4[2],
                                  const float co[2], float w[4])
 {
@@ -3563,6 +3350,8 @@ void interp_cubic_v3(float x[3], float v[3], const float x1[3], const float v1[3
  * Barycentric reverse
  *
  * Compute coordinates (u, v) for point \a st with respect to triangle (\a st0, \a st1, \a st2)
+ *
+ * \note same basic result as #barycentric_weights_v2, see its comment for details.
  */
 void resolve_tri_uv_v2(float r_uv[2], const float st[2],
                        const float st0[2], const float st1[2], const float st2[2])
diff --git a/source/blender/blenlib/intern/path_util.c b/source/blender/blenlib/intern/path_util.c
index 6644e6605a1..7b765cfa939 100644
--- a/source/blender/blenlib/intern/path_util.c
+++ b/source/blender/blenlib/intern/path_util.c
@@ -1167,30 +1167,6 @@ bool BLI_path_program_search(
 }
 
 /**
- * Copies into *last the part of *dir following the second-last slash.
- */
-void BLI_getlastdir(const char *dir, char *last, const size_t maxlen)
-{
-	const char *s = dir;
-	const char *lslash = NULL;
-	const char *prevslash = NULL;
-	while (*s) {
-		if ((*s == '\\') || (*s == '/')) {
-			prevslash = lslash;
-			lslash = s;
-		}
-		s++;
-	}
-	if (prevslash) {
-		BLI_strncpy(last, prevslash + 1, maxlen);
-	}
-	else {
-		BLI_strncpy(last, dir, maxlen);
-	}
-}
-
-
-/**
  * Sets the specified environment variable to the specified value,
  * and clears it if val == NULL.
  */
@@ -1615,6 +1591,90 @@ void BLI_join_dirfile(char *__restrict dst, const size_t maxlen, const char *__r
 }
 
 /**
+ * Join multiple strings into a path, ensuring only a single path separator between each,
+ * and that any trailing slash is kept.
+ *
+ * \note If you want a trailing slash, add ``SEP_STR`` as the last path argument,
+ * duplicate slashes will be cleaned up.
+ */
+size_t BLI_path_join(char *__restrict dst, const size_t dst_len, const char *path, ...)
+{
+	if (UNLIKELY(dst_len == 0)) {
+		return 0;
+	}
+	const size_t dst_last = dst_len - 1;
+	size_t ofs = BLI_strncpy_rlen(dst, path, dst_len);
+
+	if (ofs == dst_last) {
+		return ofs;
+	}
+
+	/* remove trailing slashes, unless there are _only_ trailing slashes
+	 * (allow "//" as the first argument). */
+	bool has_trailing_slash = false;
+	if (ofs != 0) {
+		size_t len = ofs;
+		while ((len != 0) && ELEM(path[len - 1], SEP, ALTSEP)) {
+			len -= 1;
+		}
+		if (len != 0) {
+			ofs = len;
+		}
+		has_trailing_slash = (path[len] != '\0');
+	}
+
+	va_list args;
+	va_start(args, path);
+	while ((path = (const char *) va_arg(args, const char *))) {
+		has_trailing_slash = false;
+		const char *path_init = path;
+		while (ELEM(path[0], SEP, ALTSEP)) {
+			path++;
+		}
+		size_t len = strlen(path);
+		if (len != 0) {
+			while ((len != 0) && ELEM(path[len - 1], SEP, ALTSEP)) {
+				len -= 1;
+			}
+
+			if (len != 0) {
+				/* the very first path may have a slash at the end */
+				if (ofs && !ELEM(dst[ofs - 1], SEP, ALTSEP)) {
+					dst[ofs++] = SEP;
+					if (ofs == dst_last) {
+						break;
+					}
+				}
+				has_trailing_slash = (path[len] != '\0');
+				if (ofs + len >= dst_last) {
+					len = dst_last - ofs;
+				}
+				memcpy(&dst[ofs], path, len);
+				ofs += len;
+				if (ofs == dst_last) {
+					break;
+				}
+			}
+		}
+		else {
+			has_trailing_slash = (path_init != path);
+		}
+	}
+	va_end(args);
+
+	if (has_trailing_slash) {
+		if ((ofs != dst_last) && (ofs != 0) && (ELEM(dst[ofs - 1], SEP, ALTSEP) == 0)) {
+			dst[ofs++] = SEP;
+		}
+	}
+
+	BLI_assert(ofs <= dst_last);
+	dst[ofs] = '\0';
+
+	return ofs;
+}
+
+/**
  * like pythons os.path.basename()
  *
  * \return The pointer into \a path string immediately after last slash,
@@ -1626,6 +1686,71 @@ const char *BLI_path_basename(const char *path)
 	return filename ? filename + 1 : path;
 }
 
+/**
+ * Get an element of the path at an index, eg:
+ * "/some/path/file.txt" where an index of...
+ * - 0 or -3: "some"
+ * - 1 or -2: "path"
+ * - 2 or -1: "file.txt"
+ *
+ * Ignores multiple slashes at any point in the path (including start/end).
+ */
+bool BLI_path_name_at_index(const char *path, const int index, int *r_offset, int *r_len)
+{
+	if (index >= 0) {
+		int index_step = 0;
+		int prev = -1;
+		int i = 0;
+		while (true) {
+			const char c = path[i];
+			if (ELEM(c, SEP, ALTSEP, '\0')) {
+				if (prev + 1 != i) {
+					prev += 1;
+					if (index_step == index) {
+						*r_offset = prev;
+						*r_len = i - prev;
+						/* printf("!!! %d %d\n", start, end); */
+						return true;
+					}
+					index_step += 1;
+				}
+				if (c == '\0') {
+					break;
+				}
+				prev = i;
+			}
+			i += 1;
+		}
+		return false;
+	}
+	else {
+		/* negative number, reverse where -1 is the last element */
+		int index_step = -1;
+		int prev = strlen(path);
+		int i = prev - 1;
+		while (true) {
+			const char c = i >= 0 ? path[i] : '\0';
+			if (ELEM(c, SEP, ALTSEP, '\0')) {
+				if (prev - 1 != i) {
+					i += 1;
+					if (index_step == index) {
+						*r_offset = i;
+						*r_len = prev - i;
+						return true;
+					}
+					index_step -= 1;
+				}
+				if (c == '\0') {
+					break;
+				}
+				prev = i;
+			}
+			i -= 1;
+		}
+		return false;
+	}
+}
+
 /* UNUSED */
 #if 0
 /**
diff --git a/source/blender/blenlib/intern/polyfill2d.c b/source/blender/blenlib/intern/polyfill2d.c
index 8d9881e4539..2969b0eccf4 100644
--- a/source/blender/blenlib/intern/polyfill2d.c
+++ b/source/blender/blenlib/intern/polyfill2d.c
@@ -21,8 +21,15 @@
 /** \file blender/blenlib/intern/polyfill2d.c
  *  \ingroup bli
  *
- * A simple implementation of the ear cutting algorithm
- * to triangulate simple polygons without holes.
+ * An ear clipping algorithm to triangulate single boundary polygons.
+ *
+ * Details:
+ *
+ * - The algorithm guarantees all triangles are assigned (number of coords - 2)
+ *   and that triangles will have non-overlapping indices (even for degenerate geometry).
+ * - Self-intersections are considered degenerate (resulting triangles will overlap).
+ * - While multiple polygons aren't supported, holes can still be defined using *key-holes*
+ *   (where the polygon doubles back on itself with *exactly* matching coordinates).
  *
  * \note
  *
@@ -74,6 +81,12 @@ typedef signed char eSign;
 
 #ifdef USE_KDTREE
 /**
+ * Spatial optimization for point-in-triangle intersection checks.
+ * The simple version of this algorithm is ``O(n^2)`` complexity
+ * (every point needing to check the triangle defined by every other point).
+ * Using a binary-tree reduces the complexity to ``O(n log n)``
+ * plus some overhead of creating the tree.
+ *
  * This is a single purpose KDTree based on BLI_kdtree with some modifications
  * to better suit polyfill2d.
  *
diff --git a/source/blender/blenlib/intern/rct.c b/source/blender/blenlib/intern/rct.c
index ac73a981b45..fd24a00156d 100644
--- a/source/blender/blenlib/intern/rct.c
+++ b/source/blender/blenlib/intern/rct.c
@@ -351,6 +351,22 @@ void BLI_rcti_init(rcti *rect, int xmin, int xmax, int ymin, int ymax)
 	}
 }
 
+void BLI_rctf_init_pt_radius(rctf *rect, const float xy[2], float size)
+{
+	rect->xmin = xy[0] - size;
+	rect->xmax = xy[0] + size;
+	rect->ymin = xy[1] - size;
+	rect->ymax = xy[1] + size;
+}
+
+void BLI_rcti_init_pt_radius(rcti *rect, const int xy[2], int size)
+{
+	rect->xmin = xy[0] - size;
+	rect->xmax = xy[0] + size;
+	rect->ymin = xy[1] - size;
+	rect->ymax = xy[1] + size;
+}
+
 void BLI_rcti_init_minmax(rcti *rect)
 {
 	rect->xmin = rect->ymin = INT_MAX;
diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index fc2d9674c2f..17e20f8fa18 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -48,6 +48,32 @@
  */
 #define MEMPOOL_SIZE 256
 
+/* Number of tasks which are pushed directly to local thread queue.
+ *
+ * This allows a thread to fetch its next task without locking the whole queue.
+ */
+#define LOCALQUEUE_SIZE 1
+
+#ifndef NDEBUG
+#  define ASSERT_THREAD_ID(scheduler, thread_id)                              \
+	do {                                                                      \
+		if (!BLI_thread_is_main()) {                                          \
+			TaskThread *thread = pthread_getspecific(scheduler->tls_id_key);  \
+			if (thread == NULL) {                                             \
+				BLI_assert(thread_id == 0);                                   \
+			}                                                                 \
+			else {                                                            \
+				BLI_assert(thread_id == thread->id);                          \
+			}                                                                 \
+		}                                                                     \
+		else {                                                                \
+			BLI_assert(thread_id == 0);                                       \
+		}                                                                     \
+	} while (false)
+#else
+#  define ASSERT_THREAD_ID(scheduler, thread_id)
+#endif
+
 typedef struct Task {
 	struct Task *next, *prev;
 
@@ -102,13 +128,16 @@ typedef struct TaskMemPoolStats {
 } TaskMemPoolStats;
 #endif
 
+typedef struct TaskThreadLocalStorage {
+	TaskMemPool task_mempool;
+	int num_local_queue;
+	Task *local_queue[LOCALQUEUE_SIZE];
+} TaskThreadLocalStorage;
+
 struct TaskPool {
 	TaskScheduler *scheduler;
 
 	volatile size_t num;
-	volatile size_t done;
-	size_t num_threads;
-	size_t currently_running_tasks;
 	ThreadMutex num_mutex;
 	ThreadCondition num_cond;
 
@@ -116,6 +145,11 @@ struct TaskPool {
 	ThreadMutex user_mutex;
 
 	volatile bool do_cancel;
+	volatile bool do_work;
+
+	volatile bool is_suspended;
+	ListBase suspended_queue;
+	size_t num_suspended;
 
 	/* If set, this pool may never be work_and_wait'ed, which means TaskScheduler
 	 * has to use its special background fallback thread in case we are in
@@ -123,16 +157,10 @@ struct TaskPool {
 	 */
 	bool run_in_background;
 
-	/* This pool is used for caching task pointers for thread id 0.
-	 * This could either point to a global scheduler's task_mempool[0] if the
-	 * pool is handled form the main thread or point to task_mempool_local
-	 * otherwise.
-	 *
-	 * This way we solve possible threading conflicts accessing same global
-	 * memory pool from multiple threads from which wait_work() is called.
+	/* This is the task scheduler's ID of the thread in which the pool was constructed.
+	 * It will be used to access task TLS.
 	 */
-	TaskMemPool *task_mempool;
-	TaskMemPool task_mempool_local;
+	int thread_id;
 
 #ifdef DEBUG_STATS
 	TaskMemPoolStats *mempool_stats;
@@ -142,7 +170,6 @@ struct TaskPool {
 struct TaskScheduler {
 	pthread_t *threads;
 	struct TaskThread *task_threads;
-	TaskMemPool *task_mempool;
 	int num_threads;
 	bool background_thread_only;
 
@@ -151,15 +178,19 @@ struct TaskScheduler {
 	ThreadCondition queue_cond;
 
 	volatile bool do_exit;
+
+	/* NOTE: In pthread's TLS we store the whole TaskThread structure. */
+	pthread_key_t tls_id_key;
 };
 
 typedef struct TaskThread {
 	TaskScheduler *scheduler;
 	int id;
+	TaskThreadLocalStorage tls;
 } TaskThread;
 
 /* Helper */
-static void task_data_free(Task *task, const int thread_id)
+BLI_INLINE void task_data_free(Task *task, const int thread_id)
 {
 	if (task->free_taskdata) {
 		if (task->freedata) {
@@ -171,28 +202,42 @@ static void task_data_free(Task *task, const int thread_id)
 	}
 }
 
-BLI_INLINE TaskMemPool *get_task_mempool(TaskPool *pool, const int thread_id)
+BLI_INLINE TaskThreadLocalStorage *get_task_tls(TaskPool *pool,
+                                                const int thread_id)
 {
+	TaskScheduler *scheduler = pool->scheduler;
+	BLI_assert(thread_id >= 0);
+	BLI_assert(thread_id <= scheduler->num_threads);
 	if (thread_id == 0) {
-		return pool->task_mempool;
+		return &scheduler->task_threads[pool->thread_id].tls;
+	}
+	return &scheduler->task_threads[thread_id].tls;
+}
+
+BLI_INLINE void free_task_tls(TaskThreadLocalStorage *tls)
+{
+	TaskMemPool *task_mempool = &tls->task_mempool;
+	for (int i = 0; i < task_mempool->num_tasks; ++i) {
+		MEM_freeN(task_mempool->tasks[i]);
 	}
-	return &pool->scheduler->task_mempool[thread_id];
 }
 
 static Task *task_alloc(TaskPool *pool, const int thread_id)
 {
-	assert(thread_id <= pool->scheduler->num_threads);
+	BLI_assert(thread_id <= pool->scheduler->num_threads);
 	if (thread_id != -1) {
-		assert(thread_id >= 0);
-		TaskMemPool *mem_pool = get_task_mempool(pool, thread_id);
+		BLI_assert(thread_id >= 0);
+		BLI_assert(thread_id <= pool->scheduler->num_threads);
+		TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id);
+		TaskMemPool *task_mempool = &tls->task_mempool;
 		/* Try to re-use task memory from a thread local storage. */
-		if (mem_pool->num_tasks > 0) {
-			--mem_pool->num_tasks;
+		if (task_mempool->num_tasks > 0) {
+			--task_mempool->num_tasks;
 			/* Success! We've just avoided task allocation. */
 #ifdef DEBUG_STATS
 			pool->mempool_stats[thread_id].num_reuse++;
 #endif
-			return mem_pool->tasks[mem_pool->num_tasks];
+			return task_mempool->tasks[task_mempool->num_tasks];
 		}
 		/* We are doomed to allocate new task data. */
 #ifdef DEBUG_STATS
@@ -205,13 +250,14 @@ static Task *task_alloc(TaskPool *pool, const int thread_id)
 static void task_free(TaskPool *pool, Task *task, const int thread_id)
 {
 	task_data_free(task, thread_id);
-	assert(thread_id >= 0);
-	assert(thread_id <= pool->scheduler->num_threads);
-	TaskMemPool *mem_pool = get_task_mempool(pool, thread_id);
-	if (mem_pool->num_tasks < MEMPOOL_SIZE - 1) {
+	BLI_assert(thread_id >= 0);
+	BLI_assert(thread_id <= pool->scheduler->num_threads);
+	TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id);
+	TaskMemPool *task_mempool = &tls->task_mempool;
+	if (task_mempool->num_tasks < MEMPOOL_SIZE - 1) {
 		/* Successfully allowed the task to be re-used later. */
-		mem_pool->tasks[mem_pool->num_tasks] = task;
-		++mem_pool->num_tasks;
+		task_mempool->tasks[task_mempool->num_tasks] = task;
+		++task_mempool->num_tasks;
 	}
 	else {
 		/* Local storage saturated, no other way than just discard
@@ -237,8 +283,6 @@ static void task_pool_num_decrease(TaskPool *pool, size_t done)
 	BLI_assert(pool->num >= done);
 
 	pool->num -= done;
-	atomic_sub_and_fetch_z(&pool->currently_running_tasks, done);
-	pool->done += done;
 
 	if (pool->num == 0)
 		BLI_condition_notify_all(&pool->num_cond);
@@ -246,11 +290,11 @@ static void task_pool_num_decrease(TaskPool *pool, size_t done)
 	BLI_mutex_unlock(&pool->num_mutex);
 }
 
-static void task_pool_num_increase(TaskPool *pool)
+static void task_pool_num_increase(TaskPool *pool, size_t new)
 {
 	BLI_mutex_lock(&pool->num_mutex);
 
-	pool->num++;
+	pool->num += new;
 	BLI_condition_notify_all(&pool->num_cond);
 
 	BLI_mutex_unlock(&pool->num_mutex);
@@ -292,17 +336,10 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task
 				continue;
 			}
 
-			if (atomic_add_and_fetch_z(&pool->currently_running_tasks, 1) <= pool->num_threads ||
-			    pool->num_threads == 0)
-			{
-				*task = current_task;
-				found_task = true;
-				BLI_remlink(&scheduler->queue, *task);
-				break;
-			}
-			else {
-				atomic_sub_and_fetch_z(&pool->currently_running_tasks, 1);
-			}
+			*task = current_task;
+			found_task = true;
+			BLI_remlink(&scheduler->queue, *task);
+			break;
 		}
 		if (!found_task)
 			BLI_condition_wait(&scheduler->queue_cond, &scheduler->queue_mutex);
@@ -313,13 +350,34 @@ static bool task_scheduler_thread_wait_pop(TaskScheduler *scheduler, Task **task
 	return true;
 }
 
+BLI_INLINE void handle_local_queue(TaskThreadLocalStorage *tls,
+                                   const int thread_id)
+{
+	while (tls->num_local_queue > 0) {
+		/* Pop the task from the queue before running it, so that the task's
+		 * handler can push its next job onto the local queue.
+		 */
+		tls->num_local_queue--;
+		Task *local_task = tls->local_queue[tls->num_local_queue];
+		/* TODO(sergey): Double-check work_and_wait() doesn't handle tasks
+		 * of other pools.
+		 */
+		TaskPool *local_pool = local_task->pool;
+		local_task->run(local_pool, local_task->taskdata, thread_id);
+		task_free(local_pool, local_task, thread_id);
+	}
+}
+
 static void *task_scheduler_thread_run(void *thread_p)
 {
 	TaskThread *thread = (TaskThread *) thread_p;
+	TaskThreadLocalStorage *tls = &thread->tls;
 	TaskScheduler *scheduler = thread->scheduler;
 	int thread_id = thread->id;
 	Task *task;
 
+	pthread_setspecific(scheduler->tls_id_key, thread);
+
 	/* keep popping off tasks */
 	while (task_scheduler_thread_wait_pop(scheduler, &task)) {
 		TaskPool *pool = task->pool;
@@ -330,6 +388,9 @@ static void *task_scheduler_thread_run(void *thread_p)
 		/* delete task */
 		task_free(pool, task, thread_id);
 
+		/* Handle all tasks from local queue. */
+		handle_local_queue(tls, thread_id);
+
 		/* notify pool task was done */
 		task_pool_num_decrease(pool, 1);
 	}
@@ -359,20 +420,24 @@ TaskScheduler *BLI_task_scheduler_create(int num_threads)
 
 	/* Add background-only thread if needed. */
 	if (num_threads == 0) {
-	    scheduler->background_thread_only = true;
-	    num_threads = 1;
+		scheduler->background_thread_only = true;
+		num_threads = 1;
 	}
 
+	scheduler->task_threads = MEM_callocN(sizeof(TaskThread) * (num_threads + 1),
+	                                      "TaskScheduler task threads");
+
+	pthread_key_create(&scheduler->tls_id_key, NULL);
+
 	/* launch threads that will be waiting for work */
 	if (num_threads > 0) {
 		int i;
 
 		scheduler->num_threads = num_threads;
 		scheduler->threads = MEM_callocN(sizeof(pthread_t) * num_threads, "TaskScheduler threads");
-		scheduler->task_threads = MEM_callocN(sizeof(TaskThread) * num_threads, "TaskScheduler task threads");
 
 		for (i = 0; i < num_threads; i++) {
-			TaskThread *thread = &scheduler->task_threads[i];
+			TaskThread *thread = &scheduler->task_threads[i + 1];
 			thread->scheduler = scheduler;
 			thread->id = i + 1;
 
@@ -380,9 +445,6 @@ TaskScheduler *BLI_task_scheduler_create(int num_threads)
 				fprintf(stderr, "TaskScheduler failed to launch thread %d/%d\n", i, num_threads);
 			}
 		}
-
-		scheduler->task_mempool = MEM_callocN(sizeof(*scheduler->task_mempool) * (num_threads + 1),
-		                                      "TaskScheduler task_mempool");
 	}
 
 	return scheduler;
@@ -398,6 +460,8 @@ void BLI_task_scheduler_free(TaskScheduler *scheduler)
 	BLI_condition_notify_all(&scheduler->queue_cond);
 	BLI_mutex_unlock(&scheduler->queue_mutex);
 
+	pthread_key_delete(scheduler->tls_id_key);
+
 	/* delete threads */
 	if (scheduler->threads) {
 		int i;
@@ -412,17 +476,12 @@ void BLI_task_scheduler_free(TaskScheduler *scheduler)
 
 	/* Delete task thread data */
 	if (scheduler->task_threads) {
-		MEM_freeN(scheduler->task_threads);
-	}
-
-	/* Delete task memory pool */
-	if (scheduler->task_mempool) {
-		for (int i = 0; i <= scheduler->num_threads; ++i) {
-			for (int j = 0; j < scheduler->task_mempool[i].num_tasks; ++j) {
-				MEM_freeN(scheduler->task_mempool[i].tasks[j]);
-			}
+		for (int i = 0; i < scheduler->num_threads + 1; ++i) {
+			TaskThreadLocalStorage *tls = &scheduler->task_threads[i].tls;
+			free_task_tls(tls);
 		}
-		MEM_freeN(scheduler->task_mempool);
+
+		MEM_freeN(scheduler->task_threads);
 	}
 
 	/* delete leftover tasks */
@@ -445,7 +504,7 @@ int BLI_task_scheduler_num_threads(TaskScheduler *scheduler)
 
 static void task_scheduler_push(TaskScheduler *scheduler, Task *task, TaskPriority priority)
 {
-	task_pool_num_increase(task->pool);
+	task_pool_num_increase(task->pool, 1);
 
 	/* add task to queue */
 	BLI_mutex_lock(&scheduler->queue_mutex);
@@ -471,7 +530,7 @@ static void task_scheduler_clear(TaskScheduler *scheduler, TaskPool *pool)
 		nexttask = task->next;
 
 		if (task->pool == pool) {
-			task_data_free(task, 0);
+			task_data_free(task, pool->thread_id);
 			BLI_freelinkN(&scheduler->queue, task);
 
 			done++;
@@ -486,7 +545,10 @@ static void task_scheduler_clear(TaskScheduler *scheduler, TaskPool *pool)
 
 /* Task Pool */
 
-static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, const bool is_background)
+static TaskPool *task_pool_create_ex(TaskScheduler *scheduler,
+                                     void *userdata,
+                                     const bool is_background,
+                                     const bool is_suspended)
 {
 	TaskPool *pool = MEM_mallocN(sizeof(TaskPool), "TaskPool");
 
@@ -504,10 +566,11 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c
 
 	pool->scheduler = scheduler;
 	pool->num = 0;
-	pool->done = 0;
-	pool->num_threads = 0;
-	pool->currently_running_tasks = 0;
 	pool->do_cancel = false;
+	pool->do_work = false;
+	pool->is_suspended = is_suspended;
+	pool->num_suspended = 0;
+	pool->suspended_queue.first = pool->suspended_queue.last = NULL;
 	pool->run_in_background = is_background;
 
 	BLI_mutex_init(&pool->num_mutex);
@@ -517,11 +580,21 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c
 	BLI_mutex_init(&pool->user_mutex);
 
 	if (BLI_thread_is_main()) {
-		pool->task_mempool = scheduler->task_mempool;
+		pool->thread_id = 0;
 	}
 	else {
-		pool->task_mempool = &pool->task_mempool_local;
-		pool->task_mempool_local.num_tasks = 0;
+		TaskThread *thread = pthread_getspecific(scheduler->tls_id_key);
+		/* NOTE: It is possible that pool is created from non-main thread
+		 * which isn't a scheduler thread. In this case pthread's TLS will
+		 * be NULL and we can safely consider thread id 0 for the main
+		 * thread of this pool (the one which does work_and_wait()).
+		 */
+		if (thread == NULL) {
+			pool->thread_id = 0;
+		}
+		else {
+			pool->thread_id = thread->id;
+		}
 	}
 
 #ifdef DEBUG_STATS
@@ -548,7 +621,7 @@ static TaskPool *task_pool_create_ex(TaskScheduler *scheduler, void *userdata, c
  */
 TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata)
 {
-	return task_pool_create_ex(scheduler, userdata, false);
+	return task_pool_create_ex(scheduler, userdata, false, false);
 }
 
 /**
@@ -563,25 +636,28 @@ TaskPool *BLI_task_pool_create(TaskScheduler *scheduler, void *userdata)
  */
 TaskPool *BLI_task_pool_create_background(TaskScheduler *scheduler, void *userdata)
 {
-	return task_pool_create_ex(scheduler, userdata, true);
+	return task_pool_create_ex(scheduler, userdata, true, false);
+}
+
+/**
+ * Similar to BLI_task_pool_create() but does not schedule any tasks for execution
+ * until BLI_task_pool_work_and_wait() is called. This helps reduce threading
+ * overhead when pushing a huge number of small initial tasks from the main thread.
+ */
+TaskPool *BLI_task_pool_create_suspended(TaskScheduler *scheduler, void *userdata)
+{
+	return task_pool_create_ex(scheduler, userdata, false, true);
 }
 
 void BLI_task_pool_free(TaskPool *pool)
 {
-	BLI_task_pool_stop(pool);
+	BLI_task_pool_cancel(pool);
 
 	BLI_mutex_end(&pool->num_mutex);
 	BLI_condition_end(&pool->num_cond);
 
 	BLI_mutex_end(&pool->user_mutex);
 
-	/* Free local memory pool, those pointers are lost forever. */
-	if (pool->task_mempool == &pool->task_mempool_local) {
-		for (int i = 0; i < pool->task_mempool_local.num_tasks; i++) {
-			MEM_freeN(pool->task_mempool_local.tasks[i]);
-		}
-	}
-
 #ifdef DEBUG_STATS
 	printf("Thread ID    Allocated   Reused   Discarded\n");
 	for (int i = 0; i < pool->scheduler->num_threads + 1; ++i) {
@@ -612,6 +688,25 @@ static void task_pool_push(
 	task->freedata = freedata;
 	task->pool = pool;
 
+	if (pool->is_suspended) {
+		BLI_addhead(&pool->suspended_queue, task);
+		atomic_fetch_and_add_z(&pool->num_suspended, 1);
+		return;
+	}
+
+	if (thread_id != -1 &&
+	    (thread_id != pool->thread_id || pool->do_work))
+	{
+		ASSERT_THREAD_ID(pool->scheduler, thread_id);
+
+		TaskThreadLocalStorage *tls = get_task_tls(pool, thread_id);
+		if (tls->num_local_queue < LOCALQUEUE_SIZE) {
+			tls->local_queue[tls->num_local_queue] = task;
+			tls->num_local_queue++;
+			return;
+		}
+	}
+
 	task_scheduler_push(pool->scheduler, task, priority);
 }
 
@@ -636,8 +731,27 @@ void BLI_task_pool_push_from_thread(TaskPool *pool, TaskRunFunction run,
 
 void BLI_task_pool_work_and_wait(TaskPool *pool)
 {
+	TaskThreadLocalStorage *tls = get_task_tls(pool, pool->thread_id);
 	TaskScheduler *scheduler = pool->scheduler;
 
+	if (atomic_fetch_and_and_uint8((uint8_t *)&pool->is_suspended, 0)) {
+		if (pool->num_suspended) {
+			task_pool_num_increase(pool, pool->num_suspended);
+			BLI_mutex_lock(&scheduler->queue_mutex);
+
+			BLI_movelisttolist(&scheduler->queue, &pool->suspended_queue);
+
+			BLI_condition_notify_all(&scheduler->queue_cond);
+			BLI_mutex_unlock(&scheduler->queue_mutex);
+
+		}
+		pool->is_suspended = false;
+	}
+
+	pool->do_work = true;
+
+	ASSERT_THREAD_ID(pool->scheduler, pool->thread_id);
+
 	BLI_mutex_lock(&pool->num_mutex);
 
 	while (pool->num != 0) {
@@ -651,16 +765,12 @@ void BLI_task_pool_work_and_wait(TaskPool *pool)
 		/* find task from this pool. if we get a task from another pool,
 		 * we can get into deadlock */
 
-		if (pool->num_threads == 0 ||
-		    pool->currently_running_tasks < pool->num_threads)
-		{
-			for (task = scheduler->queue.first; task; task = task->next) {
-				if (task->pool == pool) {
-					work_task = task;
-					found_task = true;
-					BLI_remlink(&scheduler->queue, task);
-					break;
-				}
+		for (task = scheduler->queue.first; task; task = task->next) {
+			if (task->pool == pool) {
+				work_task = task;
+				found_task = true;
+				BLI_remlink(&scheduler->queue, task);
+				break;
 			}
 		}
 
@@ -669,11 +779,13 @@ void BLI_task_pool_work_and_wait(TaskPool *pool)
 		/* if found task, do it, otherwise wait until other tasks are done */
 		if (found_task) {
 			/* run task */
-			atomic_add_and_fetch_z(&pool->currently_running_tasks, 1);
-			work_task->run(pool, work_task->taskdata, 0);
+			work_task->run(pool, work_task->taskdata, pool->thread_id);
 
 			/* delete task */
-			task_free(pool, task, 0);
+			task_free(pool, task, pool->thread_id);
+
+			/* Handle all tasks from local queue. */
+			handle_local_queue(tls, pool->thread_id);
 
 			/* notify pool task was done */
 			task_pool_num_decrease(pool, 1);
@@ -688,22 +800,8 @@ void BLI_task_pool_work_and_wait(TaskPool *pool)
 	}
 
 	BLI_mutex_unlock(&pool->num_mutex);
-}
 
-int BLI_pool_get_num_threads(TaskPool *pool)
-{
-	if (pool->num_threads != 0) {
-		return pool->num_threads;
-	}
-	else {
-		return BLI_task_scheduler_num_threads(pool->scheduler);
-	}
-}
-
-void BLI_pool_set_num_threads(TaskPool *pool, int num_threads)
-{
-	/* NOTE: Don't try to modify threads while tasks are running! */
-	pool->num_threads = num_threads;
+	handle_local_queue(tls, pool->thread_id);
 }
 
 void BLI_task_pool_cancel(TaskPool *pool)
@@ -721,13 +819,6 @@ void BLI_task_pool_cancel(TaskPool *pool)
 	pool->do_cancel = false;
 }
 
-void BLI_task_pool_stop(TaskPool *pool)
-{
-	task_scheduler_clear(pool->scheduler, pool);
-
-	BLI_assert(pool->num == 0);
-}
-
 bool BLI_task_pool_canceled(TaskPool *pool)
 {
 	return pool->do_cancel;
@@ -743,11 +834,6 @@ ThreadMutex *BLI_task_pool_user_mutex(TaskPool *pool)
 	return &pool->user_mutex;
 }
 
-size_t BLI_task_pool_tasks_done(TaskPool *pool)
-{
-	return pool->done;
-}
-
 /* Parallel range routines */
 
 /**
@@ -783,7 +869,7 @@ BLI_INLINE bool parallel_range_next_iter_get(
         int * __restrict iter, int * __restrict count)
 {
 	uint32_t uval = atomic_fetch_and_add_uint32((uint32_t *)(&state->iter), state->chunk_size);
-	int previter = *(int32_t*)&uval;
+	int previter = *(int32_t *)&uval;
 
 	*iter = previter;
 	*count = max_ii(0, min_ii(state->chunk_size, state->stop - previter));
@@ -918,7 +1004,8 @@ static void task_parallel_range_ex(
 		BLI_task_pool_push_from_thread(task_pool,
 		                               parallel_range_func,
 		                               userdata_chunk_local, false,
-		                               TASK_PRIORITY_HIGH, 0);
+		                               TASK_PRIORITY_HIGH,
+		                               task_pool->thread_id);
 	}
 
 	BLI_task_pool_work_and_wait(task_pool);
@@ -1124,7 +1211,8 @@ void BLI_task_parallel_listbase(
 		BLI_task_pool_push_from_thread(task_pool,
 		                               parallel_listbase_func,
 		                               NULL, false,
-		                               TASK_PRIORITY_HIGH, 0);
+		                               TASK_PRIORITY_HIGH,
+		                               task_pool->thread_id);
 	}
 
 	BLI_task_pool_work_and_wait(task_pool);
diff --git a/source/blender/blenlib/intern/threads.c b/source/blender/blenlib/intern/threads.c
index b60981802aa..77da3be0600 100644
--- a/source/blender/blenlib/intern/threads.c
+++ b/source/blender/blenlib/intern/threads.c
@@ -54,6 +54,8 @@
 #  include <sys/time.h>
 #endif
 
+#include "atomic_ops.h"
+
 #if defined(__APPLE__) && defined(_OPENMP) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 2) && !defined(__clang__)
 #  define USE_APPLE_OMP_FIX
 #endif
@@ -124,7 +126,7 @@ static pthread_mutex_t _colormanage_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t _fftw_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_mutex_t _view3d_lock = PTHREAD_MUTEX_INITIALIZER;
 static pthread_t mainid;
-static int thread_levels = 0;  /* threads can be invoked inside threads */
+static unsigned int thread_levels = 0;  /* threads can be invoked inside threads */
 static int num_threads_override = 0;
 
 /* just a max for security reasons */
@@ -198,9 +200,9 @@ void BLI_init_threads(ListBase *threadbase, void *(*do_thread)(void *), int tot)
 			tslot->avail = 1;
 		}
 	}
-	
-	BLI_spin_lock(&_malloc_lock);
-	if (thread_levels == 0) {
+
+	unsigned int level = atomic_fetch_and_add_u(&thread_levels, 1);
+	if (level == 0) {
 		MEM_set_lock_callback(BLI_lock_malloc_thread, BLI_unlock_malloc_thread);
 
 #ifdef USE_APPLE_OMP_FIX
@@ -210,9 +212,6 @@ void BLI_init_threads(ListBase *threadbase, void *(*do_thread)(void *), int tot)
 		thread_tls_data = pthread_getspecific(gomp_tls_key);
 #endif
 	}
-
-	thread_levels++;
-	BLI_spin_unlock(&_malloc_lock);
 }
 
 /* amount of available threads */
@@ -331,11 +330,10 @@ void BLI_end_threads(ListBase *threadbase)
 		BLI_freelistN(threadbase);
 	}
 
-	BLI_spin_lock(&_malloc_lock);
-	thread_levels--;
-	if (thread_levels == 0)
+	unsigned int level = atomic_sub_and_fetch_u(&thread_levels, 1);
+	if (level == 0) {
 		MEM_set_lock_callback(NULL, NULL);
-	BLI_spin_unlock(&_malloc_lock);
+	}
 }
 
 /* System Information */
@@ -812,26 +810,17 @@ void BLI_thread_queue_wait_finish(ThreadQueue *queue)
 
 void BLI_begin_threaded_malloc(void)
 {
-	/* Used for debug only */
-	/* BLI_assert(thread_levels >= 0); */
-
-	BLI_spin_lock(&_malloc_lock);
-	if (thread_levels == 0) {
+	unsigned int level = atomic_fetch_and_add_u(&thread_levels, 1);
+	if (level == 0) {
 		MEM_set_lock_callback(BLI_lock_malloc_thread, BLI_unlock_malloc_thread);
 	}
-	thread_levels++;
-	BLI_spin_unlock(&_malloc_lock);
 }
 
 void BLI_end_threaded_malloc(void)
 {
-	/* Used for debug only */
-	/* BLI_assert(thread_levels >= 0); */
-
-	BLI_spin_lock(&_malloc_lock);
-	thread_levels--;
-	if (thread_levels == 0)
+	unsigned int level = atomic_sub_and_fetch_u(&thread_levels, 1);
+	if (level == 0) {
 		MEM_set_lock_callback(NULL, NULL);
-	BLI_spin_unlock(&_malloc_lock);
+	}
 }
 
diff --git a/source/blender/blenloader/BLO_readfile.h b/source/blender/blenloader/BLO_readfile.h
index c85cf128643..e6fc4703248 100644
--- a/source/blender/blenloader/BLO_readfile.h
+++ b/source/blender/blenloader/BLO_readfile.h
@@ -71,11 +71,28 @@ typedef struct BlendFileData {
 	BlenFileType type;
 } BlendFileData;
 
-BlendFileData *BLO_read_from_file(const char *filepath, struct ReportList *reports);
-BlendFileData *BLO_read_from_memory(const void *mem, int memsize, struct ReportList *reports);
+
+/* skip reading some data-block types (may want to skip screen data too). */
+typedef enum eBLOReadSkip {
+	BLO_READ_SKIP_NONE          = 0,
+	BLO_READ_SKIP_USERDEF       = (1 << 0),
+	BLO_READ_SKIP_DATA          = (1 << 1),
+} eBLOReadSkip;
+/* All eBLOReadSkip bits combined. */
+#define BLO_READ_SKIP_ALL \
+	(BLO_READ_SKIP_USERDEF | BLO_READ_SKIP_DATA)
+
+/* File reading entry points. `skip_flags` is a combination of eBLOReadSkip
+ * bits; the implementations store it on the file data before reading. */
+BlendFileData *BLO_read_from_file(
+        const char *filepath,
+        struct ReportList *reports, eBLOReadSkip skip_flags);
+BlendFileData *BLO_read_from_memory(
+        const void *mem, int memsize,
+        struct ReportList *reports, eBLOReadSkip skip_flags);
 BlendFileData *BLO_read_from_memfile(
         struct Main *oldmain, const char *filename, struct MemFile *memfile,
-        struct ReportList *reports);
+        struct ReportList *reports, eBLOReadSkip skip_flags);
 
 void BLO_blendfiledata_free(BlendFileData *bfd);
 
diff --git a/source/blender/blenloader/CMakeLists.txt b/source/blender/blenloader/CMakeLists.txt
index 8cb9ef837b2..3d2e8a306de 100644
--- a/source/blender/blenloader/CMakeLists.txt
+++ b/source/blender/blenloader/CMakeLists.txt
@@ -69,6 +69,12 @@ if(WITH_BUILDINFO)
 	add_definitions(-DWITH_BUILDINFO)
 endif()
 
+if(WITH_PYTHON)
+	if(WITH_PYTHON_SECURITY)
+		add_definitions(-DWITH_PYTHON_SECURITY)
+	endif()
+endif()
+
 if(WITH_INTERNATIONAL)
 	add_definitions(-DWITH_INTERNATIONAL)
 endif()
diff --git a/source/blender/blenloader/intern/readblenentry.c b/source/blender/blenloader/intern/readblenentry.c
index be893177b3b..73109413271 100644
--- a/source/blender/blenloader/intern/readblenentry.c
+++ b/source/blender/blenloader/intern/readblenentry.c
@@ -317,7 +317,9 @@ void BLO_blendhandle_close(BlendHandle *bh)
  * \param reports If the return value is NULL, errors indicating the cause of the failure.
  * \return The data of the file.
  */
-BlendFileData *BLO_read_from_file(const char *filepath, ReportList *reports)
+BlendFileData *BLO_read_from_file(
+        const char *filepath,
+        ReportList *reports, eBLOReadSkip skip_flags)
 {
 	BlendFileData *bfd = NULL;
 	FileData *fd;
@@ -325,6 +327,7 @@ BlendFileData *BLO_read_from_file(const char *filepath, ReportList *reports)
 	fd = blo_openblenderfile(filepath, reports);
 	if (fd) {
 		fd->reports = reports;
+		fd->skip_flags = skip_flags;
 		bfd = blo_read_file_internal(fd, filepath);
 		blo_freefiledata(fd);
 	}
@@ -341,7 +344,9 @@ BlendFileData *BLO_read_from_file(const char *filepath, ReportList *reports)
  * \param reports If the return value is NULL, errors indicating the cause of the failure.
  * \return The data of the file.
  */
-BlendFileData *BLO_read_from_memory(const void *mem, int memsize, ReportList *reports)
+BlendFileData *BLO_read_from_memory(
+        const void *mem, int memsize,
+        ReportList *reports, eBLOReadSkip skip_flags)
 {
 	BlendFileData *bfd = NULL;
 	FileData *fd;
@@ -349,6 +354,7 @@ BlendFileData *BLO_read_from_memory(const void *mem, int memsize, ReportList *re
 	fd = blo_openblendermemory(mem, memsize,  reports);
 	if (fd) {
 		fd->reports = reports;
+		fd->skip_flags = skip_flags;
 		bfd = blo_read_file_internal(fd, "");
 		blo_freefiledata(fd);
 	}
@@ -362,7 +368,9 @@ BlendFileData *BLO_read_from_memory(const void *mem, int memsize, ReportList *re
  * \param oldmain old main, from which we will keep libraries and other datablocks that should not have changed.
  * \param filename current file, only for retrieving library data.
  */
-BlendFileData *BLO_read_from_memfile(Main *oldmain, const char *filename, MemFile *memfile, ReportList *reports)
+BlendFileData *BLO_read_from_memfile(
+        Main *oldmain, const char *filename, MemFile *memfile,
+        ReportList *reports, eBLOReadSkip skip_flags)
 {
 	BlendFileData *bfd = NULL;
 	FileData *fd;
@@ -371,6 +379,7 @@ BlendFileData *BLO_read_from_memfile(Main *oldmain, const char *filename, MemFil
 	fd = blo_openblendermemfile(memfile, reports);
 	if (fd) {
 		fd->reports = reports;
+		fd->skip_flags = skip_flags;
 		BLI_strncpy(fd->relabase, filename, sizeof(fd->relabase));
 		
 		/* clear ob->proxy_from pointers in old main */
diff --git a/source/blender/blenloader/intern/readfile.c b/source/blender/blenloader/intern/readfile.c
index 6264336c096..09c88ac945a 100644
--- a/source/blender/blenloader/intern/readfile.c
+++ b/source/blender/blenloader/intern/readfile.c
@@ -2192,19 +2192,19 @@ static void direct_link_curvemapping(FileData *fd, CurveMapping *cumap)
 /* library brush linking after fileread */
 static void lib_link_brush(FileData *fd, Main *main)
 {
-	Brush *brush;
-	
 	/* only link ID pointers */
-	for (brush = main->brush.first; brush; brush = brush->id.next) {
+	for (Brush *brush = main->brush.first; brush; brush = brush->id.next) {
 		if (brush->id.tag & LIB_TAG_NEED_LINK) {
-			brush->id.tag &= ~LIB_TAG_NEED_LINK;
-			
+			IDP_LibLinkProperty(brush->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
+
 			/* brush->(mask_)mtex.obj is ignored on purpose? */
 			brush->mtex.tex = newlibadr_us(fd, brush->id.lib, brush->mtex.tex);
 			brush->mask_mtex.tex = newlibadr_us(fd, brush->id.lib, brush->mask_mtex.tex);
 			brush->clone.image = newlibadr(fd, brush->id.lib, brush->clone.image);
 			brush->toggle_brush = newlibadr(fd, brush->id.lib, brush->toggle_brush);
 			brush->paint_curve = newlibadr_us(fd, brush->id.lib, brush->paint_curve);
+
+			brush->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
 }
@@ -2227,13 +2227,13 @@ static void direct_link_brush(FileData *fd, Brush *brush)
 }
 
 /* ************ READ Palette *************** */
-static void lib_link_palette(FileData *UNUSED(fd), Main *main)
+static void lib_link_palette(FileData *fd, Main *main)
 {
-	Palette *palette;
-
 	/* only link ID pointers */
-	for (palette = main->palettes.first; palette; palette = palette->id.next) {
+	for (Palette *palette = main->palettes.first; palette; palette = palette->id.next) {
 		if (palette->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(palette->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
+
 			palette->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
@@ -2245,13 +2245,13 @@ static void direct_link_palette(FileData *fd, Palette *palette)
 	link_list(fd, &palette->colors);
 }
 
-static void lib_link_paint_curve(FileData *UNUSED(fd), Main *main)
+static void lib_link_paint_curve(FileData *fd, Main *main)
 {
-	PaintCurve *pc;
-
 	/* only link ID pointers */
-	for (pc = main->paintcurves.first; pc; pc = pc->id.next) {
+	for (PaintCurve *pc = main->paintcurves.first; pc; pc = pc->id.next) {
 		if (pc->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(pc->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
+
 			pc->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
@@ -2503,15 +2503,12 @@ static void direct_link_fcurves(FileData *fd, ListBase *list)
 
 static void lib_link_action(FileData *fd, Main *main)
 {
-	bAction *act;
-	bActionChannel *chan;
-
-	for (act = main->action.first; act; act = act->id.next) {
+	for (bAction *act = main->action.first; act; act = act->id.next) {
 		if (act->id.tag & LIB_TAG_NEED_LINK) {
-			act->id.tag &= ~LIB_TAG_NEED_LINK;
+			IDP_LibLinkProperty(act->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			
 // XXX deprecated - old animation system <<<
-			for (chan=act->chanbase.first; chan; chan=chan->next) {
+			for (bActionChannel *chan = act->chanbase.first; chan; chan = chan->next) {
 				chan->ipo = newlibadr_us(fd, act->id.lib, chan->ipo);
 				lib_link_constraint_channels(fd, &act->id, &chan->constraintChannels);
 			}
@@ -2524,6 +2521,8 @@ static void lib_link_action(FileData *fd, Main *main)
 					marker->camera = newlibadr(fd, act->id.lib, marker->camera);
 				}
 			}
+
+			act->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
 }
@@ -2710,26 +2709,20 @@ static void direct_link_animdata(FileData *fd, AnimData *adt)
 
 static void lib_link_cachefiles(FileData *fd, Main *bmain)
 {
-	CacheFile *cache_file;
-
 	/* only link ID pointers */
-	for (cache_file = bmain->cachefiles.first; cache_file; cache_file = cache_file->id.next) {
+	for (CacheFile *cache_file = bmain->cachefiles.first; cache_file; cache_file = cache_file->id.next) {
 		if (cache_file->id.tag & LIB_TAG_NEED_LINK) {
-			cache_file->id.tag &= ~LIB_TAG_NEED_LINK;
-		}
-
-		BLI_listbase_clear(&cache_file->object_paths);
-		cache_file->handle = NULL;
-		cache_file->handle_mutex = NULL;
-
-		if (cache_file->adt) {
+			IDP_LibLinkProperty(cache_file->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &cache_file->id, cache_file->adt);
+
+			cache_file->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
 }
 
 static void direct_link_cachefile(FileData *fd, CacheFile *cache_file)
 {
+	BLI_listbase_clear(&cache_file->object_paths);
 	cache_file->handle = NULL;
 	cache_file->handle_mutex = NULL;
 
@@ -2766,6 +2759,7 @@ static void lib_link_ntree(FileData *fd, ID *id, bNodeTree *ntree)
 	bNode *node;
 	bNodeSocket *sock;
 	
+	IDP_LibLinkProperty(ntree->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 	lib_link_animdata(fd, &ntree->id, ntree->adt);
 	
 	ntree->gpd = newlibadr_us(fd, id->lib, ntree->gpd);
@@ -2775,30 +2769,37 @@ static void lib_link_ntree(FileData *fd, ID *id, bNodeTree *ntree)
 		 * of library blocks that implement this.*/
 		IDP_LibLinkProperty(node->prop, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 		
-		node->id= newlibadr_us(fd, id->lib, node->id);
+		node->id = newlibadr_us(fd, id->lib, node->id);
 
-		for (sock = node->inputs.first; sock; sock = sock->next)
+		for (sock = node->inputs.first; sock; sock = sock->next) {
+			IDP_LibLinkProperty(sock->prop, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_node_socket(fd, id, sock);
-		for (sock = node->outputs.first; sock; sock = sock->next)
+		}
+		for (sock = node->outputs.first; sock; sock = sock->next) {
+			IDP_LibLinkProperty(sock->prop, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_node_socket(fd, id, sock);
+		}
 	}
 	
-	for (sock = ntree->inputs.first; sock; sock = sock->next)
+	for (sock = ntree->inputs.first; sock; sock = sock->next) {
+		IDP_LibLinkProperty(sock->prop, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 		lib_link_node_socket(fd, id, sock);
-	for (sock = ntree->outputs.first; sock; sock = sock->next)
+	}
+	for (sock = ntree->outputs.first; sock; sock = sock->next) {
+		IDP_LibLinkProperty(sock->prop, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 		lib_link_node_socket(fd, id, sock);
+	}
 }
 
 /* library ntree linking after fileread */
 static void lib_link_nodetree(FileData *fd, Main *main)
 {
-	bNodeTree *ntree;
-	
 	/* only link ID pointers */
-	for (ntree = main->nodetree.first; ntree; ntree = ntree->id.next) {
+	for (bNodeTree *ntree = main->nodetree.first; ntree; ntree = ntree->id.next) {
 		if (ntree->id.tag & LIB_TAG_NEED_LINK) {
-			ntree->id.tag &= ~LIB_TAG_NEED_LINK;
 			lib_link_ntree(fd, &ntree->id, ntree);
+
+			ntree->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
 }
@@ -3313,11 +3314,11 @@ static void lib_link_pose(FileData *fd, Main *bmain, Object *ob, bPose *pose)
 
 static void lib_link_armature(FileData *fd, Main *main)
 {
-	bArmature *arm;
-	
-	for (arm = main->armature.first; arm; arm = arm->id.next) {
+	for (bArmature *arm = main->armature.first; arm; arm = arm->id.next) {
 		if (arm->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(arm->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &arm->id, arm->adt);
+
 			arm->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
@@ -3362,14 +3363,13 @@ static void direct_link_armature(FileData *fd, bArmature *arm)
 
 static void lib_link_camera(FileData *fd, Main *main)
 {
-	Camera *ca;
-	
-	for (ca = main->camera.first; ca; ca = ca->id.next) {
+	for (Camera *ca = main->camera.first; ca; ca = ca->id.next) {
 		if (ca->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(ca->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &ca->id, ca->adt);
 			
 			ca->ipo = newlibadr_us(fd, ca->id.lib, ca->ipo); // XXX deprecated - old animation system
-			
+
 			ca->dof_ob = newlibadr(fd, ca->id.lib, ca->dof_ob);
 			
 			ca->id.tag &= ~LIB_TAG_NEED_LINK;
@@ -3388,16 +3388,13 @@ static void direct_link_camera(FileData *fd, Camera *ca)
 
 static void lib_link_lamp(FileData *fd, Main *main)
 {
-	Lamp *la;
-	MTex *mtex;
-	int a;
-	
-	for (la = main->lamp.first; la; la = la->id.next) {
+	for (Lamp *la = main->lamp.first; la; la = la->id.next) {
 		if (la->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(la->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &la->id, la->adt);
 			
-			for (a = 0; a < MAX_MTEX; a++) {
-				mtex = la->mtex[a];
+			for (int a = 0; a < MAX_MTEX; a++) {
+				MTex *mtex = la->mtex[a];
 				if (mtex) {
 					mtex->tex = newlibadr_us(fd, la->id.lib, mtex->tex);
 					mtex->object = newlibadr(fd, la->id.lib, mtex->object);
@@ -3454,17 +3451,11 @@ void blo_do_versions_key_uidgen(Key *key)
 
 static void lib_link_key(FileData *fd, Main *main)
 {
-	Key *key;
-	
-	for (key = main->key.first; key; key = key->id.next) {
-		/*check if we need to generate unique ids for the shapekeys*/
-		if (!key->uidgen) {
-			blo_do_versions_key_uidgen(key);
-		}
-		
+	for (Key *key = main->key.first; key; key = key->id.next) {
 		BLI_assert((key->id.tag & LIB_TAG_EXTERN) == 0);
 
 		if (key->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(key->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &key->id, key->adt);
 			
 			key->ipo = newlibadr_us(fd, key->id.lib, key->ipo); // XXX deprecated - old animation system
@@ -3527,15 +3518,14 @@ static void direct_link_key(FileData *fd, Key *key)
 
 static void lib_link_mball(FileData *fd, Main *main)
 {
-	MetaBall *mb;
-	int a;
-	
-	for (mb = main->mball.first; mb; mb = mb->id.next) {
+	for (MetaBall *mb = main->mball.first; mb; mb = mb->id.next) {
 		if (mb->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(mb->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &mb->id, mb->adt);
 			
-			for (a = 0; a < mb->totcol; a++) 
+			for (int a = 0; a < mb->totcol; a++) {
 				mb->mat[a] = newlibadr_us(fd, mb->id.lib, mb->mat[a]);
+			}
 			
 			mb->ipo = newlibadr_us(fd, mb->id.lib, mb->ipo); // XXX deprecated - old animation system
 			
@@ -3564,18 +3554,15 @@ static void direct_link_mball(FileData *fd, MetaBall *mb)
 
 static void lib_link_world(FileData *fd, Main *main)
 {
-	World *wrld;
-	MTex *mtex;
-	int a;
-	
-	for (wrld = main->world.first; wrld; wrld = wrld->id.next) {
+	for (World *wrld = main->world.first; wrld; wrld = wrld->id.next) {
 		if (wrld->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(wrld->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &wrld->id, wrld->adt);
 			
 			wrld->ipo = newlibadr_us(fd, wrld->id.lib, wrld->ipo); // XXX deprecated - old animation system
 			
-			for (a=0; a < MAX_MTEX; a++) {
-				mtex = wrld->mtex[a];
+			for (int a = 0; a < MAX_MTEX; a++) {
+				MTex *mtex = wrld->mtex[a];
 				if (mtex) {
 					mtex->tex = newlibadr_us(fd, wrld->id.lib, mtex->tex);
 					mtex->object = newlibadr(fd, wrld->id.lib, mtex->object);
@@ -3616,12 +3603,12 @@ static void direct_link_world(FileData *fd, World *wrld)
 
 /* ************ READ VFONT ***************** */
 
-static void lib_link_vfont(FileData *UNUSED(fd), Main *main)
+static void lib_link_vfont(FileData *fd, Main *main)
 {
-	VFont *vf;
-	
-	for (vf = main->vfont.first; vf; vf = vf->id.next) {
+	for (VFont *vf = main->vfont.first; vf; vf = vf->id.next) {
 		if (vf->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(vf->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
+
 			vf->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
@@ -3636,12 +3623,12 @@ static void direct_link_vfont(FileData *fd, VFont *vf)
 
 /* ************ READ TEXT ****************** */
 
-static void lib_link_text(FileData *UNUSED(fd), Main *main)
+static void lib_link_text(FileData *fd, Main *main)
 {
-	Text *text;
-	
-	for (text = main->text.first; text; text = text->id.next) {
+	for (Text *text = main->text.first; text; text = text->id.next) {
 		if (text->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(text->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
+
 			text->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
@@ -3690,9 +3677,7 @@ static void direct_link_text(FileData *fd, Text *text)
 
 static void lib_link_image(FileData *fd, Main *main)
 {
-	Image *ima;
-	
-	for (ima = main->image.first; ima; ima = ima->id.next) {
+	for (Image *ima = main->image.first; ima; ima = ima->id.next) {
 		if (ima->id.tag & LIB_TAG_NEED_LINK) {
 			IDP_LibLinkProperty(ima->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			
@@ -3759,15 +3744,14 @@ static void direct_link_image(FileData *fd, Image *ima)
 
 static void lib_link_curve(FileData *fd, Main *main)
 {
-	Curve *cu;
-	int a;
-	
-	for (cu = main->curve.first; cu; cu = cu->id.next) {
+	for (Curve *cu = main->curve.first; cu; cu = cu->id.next) {
 		if (cu->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(cu->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &cu->id, cu->adt);
 			
-			for (a = 0; a < cu->totcol; a++) 
+			for (int a = 0; a < cu->totcol; a++) {
 				cu->mat[a] = newlibadr_us(fd, cu->id.lib, cu->mat[a]);
+			}
 			
 			cu->bevobj = newlibadr(fd, cu->id.lib, cu->bevobj);
 			cu->taperobj = newlibadr(fd, cu->id.lib, cu->taperobj);
@@ -3852,10 +3836,9 @@ static void direct_link_curve(FileData *fd, Curve *cu)
 
 static void lib_link_texture(FileData *fd, Main *main)
 {
-	Tex *tex;
-	
-	for (tex = main->tex.first; tex; tex = tex->id.next) {
+	for (Tex *tex = main->tex.first; tex; tex = tex->id.next) {
 		if (tex->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(tex->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &tex->id, tex->adt);
 			
 			tex->ima = newlibadr_us(fd, tex->id.lib, tex->ima);
@@ -3931,12 +3914,9 @@ static void direct_link_texture(FileData *fd, Tex *tex)
 
 static void lib_link_material(FileData *fd, Main *main)
 {
-	Material *ma;
-	MTex *mtex;
-	int a;
-	
-	for (ma = main->mat.first; ma; ma = ma->id.next) {
+	for (Material *ma = main->mat.first; ma; ma = ma->id.next) {
 		if (ma->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(ma->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &ma->id, ma->adt);
 			
 			/* Link ID Properties -- and copy this comment EXACTLY for easy finding
@@ -3946,8 +3926,8 @@ static void lib_link_material(FileData *fd, Main *main)
 			ma->ipo = newlibadr_us(fd, ma->id.lib, ma->ipo);  // XXX deprecated - old animation system
 			ma->group = newlibadr_us(fd, ma->id.lib, ma->group);
 			
-			for (a = 0; a < MAX_MTEX; a++) {
-				mtex = ma->mtex[a];
+			for (int a = 0; a < MAX_MTEX; a++) {
+				MTex *mtex = ma->mtex[a];
 				if (mtex) {
 					mtex->tex = newlibadr_us(fd, ma->id.lib, mtex->tex);
 					mtex->object = newlibadr(fd, ma->id.lib, mtex->object);
@@ -4078,14 +4058,11 @@ static void lib_link_partdeflect(FileData *fd, ID *id, PartDeflect *pd)
 
 static void lib_link_particlesettings(FileData *fd, Main *main)
 {
-	ParticleSettings *part;
-	ParticleDupliWeight *dw;
-	MTex *mtex;
-	int a;
-	
-	for (part = main->particle.first; part; part = part->id.next) {
+	for (ParticleSettings *part = main->particle.first; part; part = part->id.next) {
 		if (part->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(part->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &part->id, part->adt);
+
 			part->ipo = newlibadr_us(fd, part->id.lib, part->ipo); // XXX deprecated - old animation system
 			
 			part->dup_ob = newlibadr(fd, part->id.lib, part->dup_ob);
@@ -4105,6 +4082,7 @@ static void lib_link_particlesettings(FileData *fd, Main *main)
 			}
 
 			if (part->dupliweights.first && part->dup_group) {
+				ParticleDupliWeight *dw;
 				int index_ok = 0;
 				/* check for old files without indices (all indexes 0) */
 				if (BLI_listbase_is_single(&part->dupliweights)) {
@@ -4169,8 +4147,8 @@ static void lib_link_particlesettings(FileData *fd, Main *main)
 				}
 			}
 
-			for (a = 0; a < MAX_MTEX; a++) {
-				mtex= part->mtex[a];
+			for (int a = 0; a < MAX_MTEX; a++) {
+				MTex *mtex= part->mtex[a];
 				if (mtex) {
 					mtex->tex = newlibadr_us(fd, part->id.lib, mtex->tex);
 					mtex->object = newlibadr(fd, part->id.lib, mtex->object);
@@ -4677,10 +4655,9 @@ static void direct_link_mesh(FileData *fd, Mesh *mesh)
 
 static void lib_link_latt(FileData *fd, Main *main)
 {
-	Lattice *lt;
-	
-	for (lt = main->latt.first; lt; lt = lt->id.next) {
+	for (Lattice *lt = main->latt.first; lt; lt = lt->id.next) {
 		if (lt->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(lt->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &lt->id, lt->adt);
 			
 			lt->ipo = newlibadr_us(fd, lt->id.lib, lt->ipo); // XXX deprecated - old animation system
@@ -4724,16 +4701,12 @@ static void lib_link_modifiers(FileData *fd, Object *ob)
 
 static void lib_link_object(FileData *fd, Main *main)
 {
-	Object *ob;
-	PartEff *paf;
-	bSensor *sens;
-	bController *cont;
-	bActuator *act;
-	void *poin;
-	int warn=0, a;
-	
-	for (ob = main->object.first; ob; ob = ob->id.next) {
+	bool warn = false;
+
+	for (Object *ob = main->object.first; ob; ob = ob->id.next) {
 		if (ob->id.tag & LIB_TAG_NEED_LINK) {
+			int a;
+
 			IDP_LibLinkProperty(ob->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &ob->id, ob->adt);
 			
@@ -4766,17 +4739,17 @@ static void lib_link_object(FileData *fd, Main *main)
 			}
 			ob->proxy_group = newlibadr(fd, ob->id.lib, ob->proxy_group);
 			
-			poin = ob->data;
+			void *poin = ob->data;
 			ob->data = newlibadr_us(fd, ob->id.lib, ob->data);
 			
-			if (ob->data==NULL && poin!=NULL) {
+			if (ob->data == NULL && poin != NULL) {
 				if (ob->id.lib)
 					printf("Can't find obdata of %s lib %s\n", ob->id.name + 2, ob->id.lib->name);
 				else
 					printf("Object %s lost data.\n", ob->id.name + 2);
 				
 				ob->type = OB_EMPTY;
-				warn = 1;
+				warn = true;
 				
 				if (ob->pose) {
 					/* we can't call #BKE_pose_free() here because of library linking
@@ -4822,13 +4795,13 @@ static void lib_link_object(FileData *fd, Main *main)
 			lib_link_nlastrips(fd, &ob->id, &ob->nlastrips);
 // >>> XXX deprecated - old animation system
 			
-			for (paf = ob->effect.first; paf; paf = paf->next) {
+			for (PartEff *paf = ob->effect.first; paf; paf = paf->next) {
 				if (paf->type == EFF_PARTICLE) {
 					paf->group = newlibadr_us(fd, ob->id.lib, paf->group);
 				}
 			}
 			
-			for (sens = ob->sensors.first; sens; sens = sens->next) {
+			for (bSensor *sens = ob->sensors.first; sens; sens = sens->next) {
 				for (a = 0; a < sens->totlinks; a++)
 					sens->links[a] = newglobadr(fd, sens->links[a]);
 
@@ -4839,7 +4812,7 @@ static void lib_link_object(FileData *fd, Main *main)
 				}
 			}
 			
-			for (cont = ob->controllers.first; cont; cont = cont->next) {
+			for (bController *cont = ob->controllers.first; cont; cont = cont->next) {
 				for (a=0; a < cont->totlinks; a++)
 					cont->links[a] = newglobadr(fd, cont->links[a]);
 				
@@ -4851,86 +4824,117 @@ static void lib_link_object(FileData *fd, Main *main)
 				cont->totslinks = 0;
 			}
 			
-			for (act = ob->actuators.first; act; act = act->next) {
-				if (act->type == ACT_SOUND) {
-					bSoundActuator *sa = act->data;
-					sa->sound= newlibadr_us(fd, ob->id.lib, sa->sound);
-				}
-				else if (act->type == ACT_GAME) {
-					/* bGameActuator *ga= act->data; */
-				}
-				else if (act->type == ACT_CAMERA) {
-					bCameraActuator *ca = act->data;
-					ca->ob= newlibadr(fd, ob->id.lib, ca->ob);
-				}
-				/* leave this one, it's obsolete but necessary to read for conversion */
-				else if (act->type == ACT_ADD_OBJECT) {
-					bAddObjectActuator *eoa = act->data;
-					if (eoa) eoa->ob= newlibadr(fd, ob->id.lib, eoa->ob);
-				}
-				else if (act->type == ACT_OBJECT) {
-					bObjectActuator *oa = act->data;
-					if (oa == NULL) {
-						init_actuator(act);
+			for (bActuator *act = ob->actuators.first; act; act = act->next) {
+				switch (act->type) {
+					case ACT_SOUND:
+					{
+						bSoundActuator *sa = act->data;
+						sa->sound = newlibadr_us(fd, ob->id.lib, sa->sound);
+						break;
 					}
-					else {
-						oa->reference = newlibadr(fd, ob->id.lib, oa->reference);
+					case ACT_GAME:
+						/* bGameActuator *ga= act->data; */
+						break;
+					case ACT_CAMERA:
+					{
+						bCameraActuator *ca = act->data;
+						ca->ob = newlibadr(fd, ob->id.lib, ca->ob);
+						break;
 					}
-				}
-				else if (act->type == ACT_EDIT_OBJECT) {
-					bEditObjectActuator *eoa = act->data;
-					if (eoa == NULL) {
-						init_actuator(act);
+					/* leave this one, it's obsolete but necessary to read for conversion */
+					case ACT_ADD_OBJECT:
+					{
+						bAddObjectActuator *eoa = act->data;
+						if (eoa)
+							eoa->ob = newlibadr(fd, ob->id.lib, eoa->ob);
+						break;
 					}
-					else {
-						eoa->ob= newlibadr(fd, ob->id.lib, eoa->ob);
-						eoa->me= newlibadr(fd, ob->id.lib, eoa->me);
+					case ACT_OBJECT:
+					{
+						bObjectActuator *oa = act->data;
+						if (oa == NULL) {
+							init_actuator(act);
+						}
+						else {
+							oa->reference = newlibadr(fd, ob->id.lib, oa->reference);
+						}
+						break;
 					}
-				}
-				else if (act->type == ACT_SCENE) {
-					bSceneActuator *sa = act->data;
-					sa->camera= newlibadr(fd, ob->id.lib, sa->camera);
-					sa->scene= newlibadr(fd, ob->id.lib, sa->scene);
-				}
-				else if (act->type == ACT_ACTION) {
-					bActionActuator *aa = act->data;
-					aa->act= newlibadr_us(fd, ob->id.lib, aa->act);
-				}
-				else if (act->type == ACT_SHAPEACTION) {
-					bActionActuator *aa = act->data;
-					aa->act= newlibadr_us(fd, ob->id.lib, aa->act);
-				}
-				else if (act->type == ACT_PROPERTY) {
-					bPropertyActuator *pa = act->data;
-					pa->ob= newlibadr(fd, ob->id.lib, pa->ob);
-				}
-				else if (act->type == ACT_MESSAGE) {
-					bMessageActuator *ma = act->data;
-					ma->toObject= newlibadr(fd, ob->id.lib, ma->toObject);
-				}
-				else if (act->type == ACT_2DFILTER) {
-					bTwoDFilterActuator *_2dfa = act->data; 
-					_2dfa->text= newlibadr(fd, ob->id.lib, _2dfa->text);
-				}
-				else if (act->type == ACT_PARENT) {
-					bParentActuator *parenta = act->data; 
-					parenta->ob = newlibadr(fd, ob->id.lib, parenta->ob);
-				}
-				else if (act->type == ACT_STATE) {
-					/* bStateActuator *statea = act->data; */
-				}
-				else if (act->type == ACT_ARMATURE) {
-					bArmatureActuator *arma= act->data;
-					arma->target= newlibadr(fd, ob->id.lib, arma->target);
-					arma->subtarget= newlibadr(fd, ob->id.lib, arma->subtarget);
-				}
-				else if (act->type == ACT_STEERING) {
-					bSteeringActuator *steeringa = act->data; 
-					steeringa->target = newlibadr(fd, ob->id.lib, steeringa->target);
-					steeringa->navmesh = newlibadr(fd, ob->id.lib, steeringa->navmesh);
-				}
-				else if (act->type == ACT_MOUSE) {
-					/* bMouseActuator *moa= act->data; */
+					case ACT_EDIT_OBJECT:
+					{
+						bEditObjectActuator *eoa = act->data;
+						if (eoa == NULL) {
+							init_actuator(act);
+						}
+						else {
+							eoa->ob = newlibadr(fd, ob->id.lib, eoa->ob);
+							eoa->me = newlibadr(fd, ob->id.lib, eoa->me);
+						}
+						break;
+					}
+					case ACT_SCENE:
+					{
+						bSceneActuator *sa = act->data;
+						sa->camera = newlibadr(fd, ob->id.lib, sa->camera);
+						sa->scene = newlibadr(fd, ob->id.lib, sa->scene);
+						break;
+					}
+					case ACT_ACTION:
+					{
+						bActionActuator *aa = act->data;
+						aa->act = newlibadr_us(fd, ob->id.lib, aa->act);
+						break;
+					}
+					case ACT_SHAPEACTION:
+					{
+						bActionActuator *aa = act->data;
+						aa->act = newlibadr_us(fd, ob->id.lib, aa->act);
+						break;
+					}
+					case ACT_PROPERTY:
+					{
+						bPropertyActuator *pa = act->data;
+						pa->ob = newlibadr(fd, ob->id.lib, pa->ob);
+						break;
+					}
+					case ACT_MESSAGE:
+					{
+						bMessageActuator *ma = act->data;
+						ma->toObject = newlibadr(fd, ob->id.lib, ma->toObject);
+						break;
+					}
+					case ACT_2DFILTER:
+					{
+						bTwoDFilterActuator *_2dfa = act->data;
+						_2dfa->text = newlibadr(fd, ob->id.lib, _2dfa->text);
+						break;
+					}
+					case ACT_PARENT:
+					{
+						bParentActuator *parenta = act->data;
+						parenta->ob = newlibadr(fd, ob->id.lib, parenta->ob);
+						break;
+					}
+					case ACT_STATE:
+						/* bStateActuator *statea = act->data; */
+						break;
+					case ACT_ARMATURE:
+					{
+						bArmatureActuator *arma= act->data;
+						arma->target = newlibadr(fd, ob->id.lib, arma->target);
+						arma->subtarget = newlibadr(fd, ob->id.lib, arma->subtarget);
+						break;
+					}
+					case ACT_STEERING:
+					{
+						bSteeringActuator *steeringa = act->data;
+						steeringa->target = newlibadr(fd, ob->id.lib, steeringa->target);
+						steeringa->navmesh = newlibadr(fd, ob->id.lib, steeringa->navmesh);
+						break;
+					}
+					case ACT_MOUSE:
+						/* bMouseActuator *moa = act->data; */
+						break;
 				}
 			}
 			
@@ -5317,6 +5321,37 @@ static void direct_link_modifiers(FileData *fd, ListBase *lb)
 			MeshSeqCacheModifierData *msmcd = (MeshSeqCacheModifierData *)md;
 			msmcd->reader = NULL;
 		}
+		else if (md->type == eModifierType_SurfaceDeform) {
+			SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
+
+			smd->verts = newdataadr(fd, smd->verts);
+
+			if (smd->verts) {
+				for (int i = 0; i < smd->numverts; i++) {
+					smd->verts[i].binds = newdataadr(fd, smd->verts[i].binds);
+
+					if (smd->verts[i].binds) {
+						for (int j = 0; j < smd->verts[i].numbinds; j++) {
+							smd->verts[i].binds[j].vert_inds = newdataadr(fd, smd->verts[i].binds[j].vert_inds);
+							smd->verts[i].binds[j].vert_weights = newdataadr(fd, smd->verts[i].binds[j].vert_weights);
+
+							if (fd->flags & FD_FLAGS_SWITCH_ENDIAN) {
+								if (smd->verts[i].binds[j].vert_inds)
+									BLI_endian_switch_uint32_array(smd->verts[i].binds[j].vert_inds, smd->verts[i].binds[j].numverts);
+
+								if (smd->verts[i].binds[j].vert_weights) {
+									if (smd->verts[i].binds[j].mode == MOD_SDEF_MODE_CENTROID ||
+									    smd->verts[i].binds[j].mode == MOD_SDEF_MODE_LOOPTRI)
+										BLI_endian_switch_float_array(smd->verts[i].binds[j].vert_weights, 3);
+									else
+										BLI_endian_switch_float_array(smd->verts[i].binds[j].vert_weights, smd->verts[i].binds[j].numverts);
+								}
+							}
+						}
+					}
+				}
+			}
+		}
 	}
 }
 
@@ -5631,19 +5666,12 @@ static bool scene_validate_setscene__liblink(Scene *sce, const int totscene)
 
 static void lib_link_scene(FileData *fd, Main *main)
 {
-	Scene *sce;
-	Base *base, *next;
-	Sequence *seq;
-	SceneRenderLayer *srl;
-	FreestyleModuleConfig *fmc;
-	FreestyleLineSet *fls;
-
 #ifdef USE_SETSCENE_CHECK
 	bool need_check_set = false;
 	int totscene = 0;
 #endif
 	
-	for (sce = main->scene.first; sce; sce = sce->id.next) {
+	for (Scene *sce = main->scene.first; sce; sce = sce->id.next) {
 		if (sce->id.tag & LIB_TAG_NEED_LINK) {
 			/* Link ID Properties -- and copy this comment EXACTLY for easy finding
 			 * of library blocks that implement this.*/
@@ -5683,7 +5711,7 @@ static void lib_link_scene(FileData *fd, Main *main)
 			
 			sce->toolsettings->particle.shape_object = newlibadr(fd, sce->id.lib, sce->toolsettings->particle.shape_object);
 			
-			for (base = sce->base.first; base; base = next) {
+			for (Base *next, *base = sce->base.first; base; base = next) {
 				next = base->next;
 				
 				base->object = newlibadr_us(fd, sce->id.lib, base->object);
@@ -5697,6 +5725,7 @@ static void lib_link_scene(FileData *fd, Main *main)
 				}
 			}
 			
+			Sequence *seq;
 			SEQ_BEGIN (sce->ed, seq)
 			{
 				if (seq->ipo) seq->ipo = newlibadr_us(fd, sce->id.lib, seq->ipo);  // XXX deprecated - old animation system
@@ -5762,13 +5791,13 @@ static void lib_link_scene(FileData *fd, Main *main)
 				composite_patch(sce->nodetree, sce);
 			}
 			
-			for (srl = sce->r.layers.first; srl; srl = srl->next) {
+			for (SceneRenderLayer *srl = sce->r.layers.first; srl; srl = srl->next) {
 				srl->mat_override = newlibadr_us(fd, sce->id.lib, srl->mat_override);
 				srl->light_override = newlibadr_us(fd, sce->id.lib, srl->light_override);
-				for (fmc = srl->freestyleConfig.modules.first; fmc; fmc = fmc->next) {
+				for (FreestyleModuleConfig *fmc = srl->freestyleConfig.modules.first; fmc; fmc = fmc->next) {
 					fmc->script = newlibadr(fd, sce->id.lib, fmc->script);
 				}
-				for (fls = srl->freestyleConfig.linesets.first; fls; fls = fls->next) {
+				for (FreestyleLineSet *fls = srl->freestyleConfig.linesets.first; fls; fls = fls->next) {
 					fls->linestyle = newlibadr_us(fd, sce->id.lib, fls->linestyle);
 					fls->group = newlibadr_us(fd, sce->id.lib, fls->group);
 				}
@@ -5802,7 +5831,7 @@ static void lib_link_scene(FileData *fd, Main *main)
 
 #ifdef USE_SETSCENE_CHECK
 	if (need_check_set) {
-		for (sce = main->scene.first; sce; sce = sce->id.next) {
+		for (Scene *sce = main->scene.first; sce; sce = sce->id.next) {
 			if (sce->id.tag & LIB_TAG_NEED_LINK) {
 				sce->id.tag &= ~LIB_TAG_NEED_LINK;
 				if (!scene_validate_setscene__liblink(sce, totscene)) {
@@ -6212,8 +6241,10 @@ static void lib_link_windowmanager(FileData *fd, Main *main)
 	
 	for (wm = main->wm.first; wm; wm = wm->id.next) {
 		if (wm->id.tag & LIB_TAG_NEED_LINK) {
-			for (win = wm->windows.first; win; win = win->next)
+			/* Note: WM IDProperties are never written to file, hence no need to read/link them here. */
+			for (win = wm->windows.first; win; win = win->next) {
 				win->screen = newlibadr(fd, NULL, win->screen);
+			}
 			
 			wm->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
@@ -6225,13 +6256,12 @@ static void lib_link_windowmanager(FileData *fd, Main *main)
 /* relink's grease pencil data's refs */
 static void lib_link_gpencil(FileData *fd, Main *main)
 {
-	bGPdata *gpd;
-	
-	for (gpd = main->gpencil.first; gpd; gpd = gpd->id.next) {
+	for (bGPdata *gpd = main->gpencil.first; gpd; gpd = gpd->id.next) {
 		if (gpd->id.tag & LIB_TAG_NEED_LINK) {
-			gpd->id.tag &= ~LIB_TAG_NEED_LINK;
-			
+			IDP_LibLinkProperty(gpd->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &gpd->id, gpd->adt);
+
+			gpd->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
 }
@@ -6293,12 +6323,11 @@ static void direct_link_gpencil(FileData *fd, bGPdata *gpd)
  * check lib pointers in call below */
 static void lib_link_screen(FileData *fd, Main *main)
 {
-	bScreen *sc;
-	ScrArea *sa;
-	
-	for (sc = main->screen.first; sc; sc = sc->id.next) {
+	for (bScreen *sc = main->screen.first; sc; sc = sc->id.next) {
 		if (sc->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(sc->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			id_us_ensure_real(&sc->id);
+
 			sc->scene = newlibadr(fd, sc->id.lib, sc->scene);
 
 			/* this should not happen, but apparently it does somehow. Until we figure out the cause,
@@ -6309,177 +6338,204 @@ static void lib_link_screen(FileData *fd, Main *main)
 			sc->animtimer = NULL; /* saved in rare cases */
 			sc->scrubbing = false;
 			
-			for (sa = sc->areabase.first; sa; sa = sa->next) {
-				SpaceLink *sl;
-				
+			for (ScrArea *sa = sc->areabase.first; sa; sa = sa->next) {
 				sa->full = newlibadr(fd, sc->id.lib, sa->full);
 				
-				for (sl = sa->spacedata.first; sl; sl= sl->next) {
-					if (sl->spacetype == SPACE_VIEW3D) {
-						View3D *v3d = (View3D*) sl;
-						BGpic *bgpic = NULL;
-						
-						v3d->camera= newlibadr(fd, sc->id.lib, v3d->camera);
-						v3d->ob_centre= newlibadr(fd, sc->id.lib, v3d->ob_centre);
-						
-						/* should be do_versions but not easy adding into the listbase */
-						if (v3d->bgpic) {
-							v3d->bgpic = newlibadr(fd, sc->id.lib, v3d->bgpic);
-							BLI_addtail(&v3d->bgpicbase, bgpic);
-							v3d->bgpic = NULL;
+				for (SpaceLink *sl = sa->spacedata.first; sl; sl= sl->next) {
+					switch (sl->spacetype) {
+						case SPACE_VIEW3D:
+						{
+							View3D *v3d = (View3D*) sl;
+							BGpic *bgpic = NULL;
+
+							v3d->camera= newlibadr(fd, sc->id.lib, v3d->camera);
+							v3d->ob_centre= newlibadr(fd, sc->id.lib, v3d->ob_centre);
+
+							/* should be do_versions but not easy adding into the listbase */
+							if (v3d->bgpic) {
+								v3d->bgpic = newlibadr(fd, sc->id.lib, v3d->bgpic);
+								BLI_addtail(&v3d->bgpicbase, bgpic);
+								v3d->bgpic = NULL;
+							}
+
+							for (bgpic = v3d->bgpicbase.first; bgpic; bgpic = bgpic->next) {
+								bgpic->ima = newlibadr_us(fd, sc->id.lib, bgpic->ima);
+								bgpic->clip = newlibadr_us(fd, sc->id.lib, bgpic->clip);
+							}
+							if (v3d->localvd) {
+								v3d->localvd->camera = newlibadr(fd, sc->id.lib, v3d->localvd->camera);
+							}
+							break;
 						}
-						
-						for (bgpic = v3d->bgpicbase.first; bgpic; bgpic = bgpic->next) {
-							bgpic->ima = newlibadr_us(fd, sc->id.lib, bgpic->ima);
-							bgpic->clip = newlibadr_us(fd, sc->id.lib, bgpic->clip);
+						case SPACE_IPO:
+						{
+							SpaceIpo *sipo = (SpaceIpo *)sl;
+							bDopeSheet *ads = sipo->ads;
+
+							if (ads) {
+								ads->source = newlibadr(fd, sc->id.lib, ads->source);
+								ads->filter_grp = newlibadr(fd, sc->id.lib, ads->filter_grp);
+							}
+							break;
 						}
-						if (v3d->localvd) {
-							v3d->localvd->camera = newlibadr(fd, sc->id.lib, v3d->localvd->camera);
+						case SPACE_BUTS:
+						{
+							SpaceButs *sbuts = (SpaceButs *)sl;
+							sbuts->pinid = newlibadr(fd, sc->id.lib, sbuts->pinid);
+							if (sbuts->pinid == NULL) {
+								sbuts->flag &= ~SB_PIN_CONTEXT;
+							}
+							break;
 						}
-					}
-					else if (sl->spacetype == SPACE_IPO) {
-						SpaceIpo *sipo = (SpaceIpo *)sl;
-						bDopeSheet *ads = sipo->ads;
-						
-						if (ads) {
-							ads->source = newlibadr(fd, sc->id.lib, ads->source);
-							ads->filter_grp = newlibadr(fd, sc->id.lib, ads->filter_grp);
+						case SPACE_FILE:
+							break;
+						case SPACE_ACTION:
+						{
+							SpaceAction *saction = (SpaceAction *)sl;
+							bDopeSheet *ads = &saction->ads;
+
+							if (ads) {
+								ads->source = newlibadr(fd, sc->id.lib, ads->source);
+								ads->filter_grp = newlibadr(fd, sc->id.lib, ads->filter_grp);
+							}
+
+							saction->action = newlibadr(fd, sc->id.lib, saction->action);
+							break;
 						}
-					}
-					else if (sl->spacetype == SPACE_BUTS) {
-						SpaceButs *sbuts = (SpaceButs *)sl;
-						sbuts->pinid = newlibadr(fd, sc->id.lib, sbuts->pinid);
-						if (sbuts->pinid == NULL) {
-							sbuts->flag &= ~SB_PIN_CONTEXT;
+						case SPACE_IMAGE:
+						{
+							SpaceImage *sima = (SpaceImage *)sl;
+
+							sima->image = newlibadr_real_us(fd, sc->id.lib, sima->image);
+							sima->mask_info.mask = newlibadr_real_us(fd, sc->id.lib, sima->mask_info.mask);
+
+							/* NOTE: pre-2.5, this was local data not lib data, but now we need this as lib data
+							 * so fingers crossed this works fine!
+							 */
+							sima->gpd = newlibadr_us(fd, sc->id.lib, sima->gpd);
+							break;
 						}
-					}
-					else if (sl->spacetype == SPACE_FILE) {
-						;
-					}
-					else if (sl->spacetype == SPACE_ACTION) {
-						SpaceAction *saction = (SpaceAction *)sl;
-						bDopeSheet *ads = &saction->ads;
-						
-						if (ads) {
-							ads->source = newlibadr(fd, sc->id.lib, ads->source);
-							ads->filter_grp = newlibadr(fd, sc->id.lib, ads->filter_grp);
+						case SPACE_SEQ:
+						{
+							SpaceSeq *sseq = (SpaceSeq *)sl;
+
+							/* NOTE: pre-2.5, this was local data not lib data, but now we need this as lib data
+							 * so fingers crossed this works fine!
+							 */
+							sseq->gpd = newlibadr_us(fd, sc->id.lib, sseq->gpd);
+							break;
 						}
-						
-						saction->action = newlibadr(fd, sc->id.lib, saction->action);
-					}
-					else if (sl->spacetype == SPACE_IMAGE) {
-						SpaceImage *sima = (SpaceImage *)sl;
-						
-						sima->image = newlibadr_real_us(fd, sc->id.lib, sima->image);
-						sima->mask_info.mask = newlibadr_real_us(fd, sc->id.lib, sima->mask_info.mask);
-
-						/* NOTE: pre-2.5, this was local data not lib data, but now we need this as lib data
-						 * so fingers crossed this works fine!
-						 */
-						sima->gpd = newlibadr_us(fd, sc->id.lib, sima->gpd);
-					}
-					else if (sl->spacetype == SPACE_SEQ) {
-						SpaceSeq *sseq = (SpaceSeq *)sl;
-						
-						/* NOTE: pre-2.5, this was local data not lib data, but now we need this as lib data
-						 * so fingers crossed this works fine!
-						 */
-						sseq->gpd = newlibadr_us(fd, sc->id.lib, sseq->gpd);
+						case SPACE_NLA:
+						{
+							SpaceNla *snla= (SpaceNla *)sl;
+							bDopeSheet *ads= snla->ads;
+
+							if (ads) {
+								ads->source = newlibadr(fd, sc->id.lib, ads->source);
+								ads->filter_grp = newlibadr(fd, sc->id.lib, ads->filter_grp);
+							}
+							break;
+						}
+						case SPACE_TEXT:
+						{
+							SpaceText *st= (SpaceText *)sl;
 
-					}
-					else if (sl->spacetype == SPACE_NLA) {
-						SpaceNla *snla= (SpaceNla *)sl;
-						bDopeSheet *ads= snla->ads;
-						
-						if (ads) {
-							ads->source = newlibadr(fd, sc->id.lib, ads->source);
-							ads->filter_grp = newlibadr(fd, sc->id.lib, ads->filter_grp);
+							st->text= newlibadr(fd, sc->id.lib, st->text);
+							break;
 						}
-					}
-					else if (sl->spacetype == SPACE_TEXT) {
-						SpaceText *st= (SpaceText *)sl;
-						
-						st->text= newlibadr(fd, sc->id.lib, st->text);
-					}
-					else if (sl->spacetype == SPACE_SCRIPT) {
-						SpaceScript *scpt = (SpaceScript *)sl;
-						/*scpt->script = NULL; - 2.45 set to null, better re-run the script */
-						if (scpt->script) {
-							scpt->script = newlibadr(fd, sc->id.lib, scpt->script);
+						case SPACE_SCRIPT:
+						{
+							SpaceScript *scpt = (SpaceScript *)sl;
+							/*scpt->script = NULL; - 2.45 set to null, better re-run the script */
 							if (scpt->script) {
-								SCRIPT_SET_NULL(scpt->script);
+								scpt->script = newlibadr(fd, sc->id.lib, scpt->script);
+								if (scpt->script) {
+									SCRIPT_SET_NULL(scpt->script);
+								}
 							}
+							break;
 						}
-					}
-					else if (sl->spacetype == SPACE_OUTLINER) {
-						SpaceOops *so= (SpaceOops *)sl;
-						so->search_tse.id = newlibadr(fd, NULL, so->search_tse.id);
-						
-						if (so->treestore) {
-							TreeStoreElem *tselem;
-							BLI_mempool_iter iter;
+						case SPACE_OUTLINER:
+						{
+							SpaceOops *so= (SpaceOops *)sl;
+							so->search_tse.id = newlibadr(fd, NULL, so->search_tse.id);
+
+							if (so->treestore) {
+								TreeStoreElem *tselem;
+								BLI_mempool_iter iter;
+
+								BLI_mempool_iternew(so->treestore, &iter);
+								while ((tselem = BLI_mempool_iterstep(&iter))) {
+									tselem->id = newlibadr(fd, NULL, tselem->id);
+								}
+								if (so->treehash) {
+									/* rebuild hash table, because it depends on ids too */
+									so->storeflag |= SO_TREESTORE_REBUILD;
+								}
+							}
+							break;
+						}
+						case SPACE_NODE:
+						{
+							SpaceNode *snode = (SpaceNode *)sl;
+							bNodeTreePath *path, *path_next;
+							bNodeTree *ntree;
+
+							/* node tree can be stored locally in id too, link this first */
+							snode->id = newlibadr(fd, sc->id.lib, snode->id);
+							snode->from = newlibadr(fd, sc->id.lib, snode->from);
+
+							ntree = snode->id ? ntreeFromID(snode->id) : NULL;
+							snode->nodetree = ntree ? ntree : newlibadr_us(fd, sc->id.lib, snode->nodetree);
+
+							for (path = snode->treepath.first; path; path = path->next) {
+								if (path == snode->treepath.first) {
+									/* first nodetree in path is same as snode->nodetree */
+									path->nodetree = snode->nodetree;
+								}
+								else
+									path->nodetree = newlibadr_us(fd, sc->id.lib, path->nodetree);
 
-							BLI_mempool_iternew(so->treestore, &iter);
-							while ((tselem = BLI_mempool_iterstep(&iter))) {
-								tselem->id = newlibadr(fd, NULL, tselem->id);
+								if (!path->nodetree)
+									break;
 							}
-							if (so->treehash) {
-								/* rebuild hash table, because it depends on ids too */
-								so->storeflag |= SO_TREESTORE_REBUILD;
+
+							/* remaining path entries are invalid, remove */
+							for (; path; path = path_next) {
+								path_next = path->next;
+
+								BLI_remlink(&snode->treepath, path);
+								MEM_freeN(path);
 							}
-						}
-					}
-					else if (sl->spacetype == SPACE_NODE) {
-						SpaceNode *snode = (SpaceNode *)sl;
-						bNodeTreePath *path, *path_next;
-						bNodeTree *ntree;
-						
-						/* node tree can be stored locally in id too, link this first */
-						snode->id = newlibadr(fd, sc->id.lib, snode->id);
-						snode->from = newlibadr(fd, sc->id.lib, snode->from);
-						
-						ntree = snode->id ? ntreeFromID(snode->id) : NULL;
-						snode->nodetree = ntree ? ntree : newlibadr_us(fd, sc->id.lib, snode->nodetree);
-						
-						for (path = snode->treepath.first; path; path = path->next) {
-							if (path == snode->treepath.first) {
-								/* first nodetree in path is same as snode->nodetree */
-								path->nodetree = snode->nodetree;
+
+							/* edittree is just the last in the path,
+							 * set this directly since the path may have been shortened above */
+							if (snode->treepath.last) {
+								path = snode->treepath.last;
+								snode->edittree = path->nodetree;
 							}
-							else
-								path->nodetree = newlibadr_us(fd, sc->id.lib, path->nodetree);
-							
-							if (!path->nodetree)
-								break;
+							else {
+								snode->edittree = NULL;
+							}
+							break;
 						}
-						
-						/* remaining path entries are invalid, remove */
-						for (; path; path = path_next) {
-							path_next = path->next;
-							
-							BLI_remlink(&snode->treepath, path);
-							MEM_freeN(path);
+						case SPACE_CLIP:
+						{
+							SpaceClip *sclip = (SpaceClip *)sl;
+
+							sclip->clip = newlibadr_real_us(fd, sc->id.lib, sclip->clip);
+							sclip->mask_info.mask = newlibadr_real_us(fd, sc->id.lib, sclip->mask_info.mask);
+							break;
 						}
-						
-						/* edittree is just the last in the path,
-						 * set this directly since the path may have been shortened above */
-						if (snode->treepath.last) {
-							path = snode->treepath.last;
-							snode->edittree = path->nodetree;
+						case SPACE_LOGIC:
+						{
+							SpaceLogic *slogic = (SpaceLogic *)sl;
+
+							slogic->gpd = newlibadr_us(fd, sc->id.lib, slogic->gpd);
+							break;
 						}
-						else
-							snode->edittree = NULL;
-					}
-					else if (sl->spacetype == SPACE_CLIP) {
-						SpaceClip *sclip = (SpaceClip *)sl;
-						
-						sclip->clip = newlibadr_real_us(fd, sc->id.lib, sclip->clip);
-						sclip->mask_info.mask = newlibadr_real_us(fd, sc->id.lib, sclip->mask_info.mask);
-					}
-					else if (sl->spacetype == SPACE_LOGIC) {
-						SpaceLogic *slogic = (SpaceLogic *)sl;
-						
-						slogic->gpd = newlibadr_us(fd, sc->id.lib, slogic->gpd);
+						default:
+							break;
 					}
 				}
 			}
@@ -7353,13 +7409,13 @@ static void fix_relpaths_library(const char *basepath, Main *main)
 
 static void lib_link_speaker(FileData *fd, Main *main)
 {
-	Speaker *spk;
-	
-	for (spk = main->speaker.first; spk; spk = spk->id.next) {
+	for (Speaker *spk = main->speaker.first; spk; spk = spk->id.next) {
 		if (spk->id.tag & LIB_TAG_NEED_LINK) {
+			IDP_LibLinkProperty(spk->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &spk->id, spk->adt);
 			
 			spk->sound = newlibadr_us(fd, spk->id.lib, spk->sound);
+
 			spk->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
@@ -7409,14 +7465,15 @@ static void direct_link_sound(FileData *fd, bSound *sound)
 
 static void lib_link_sound(FileData *fd, Main *main)
 {
-	bSound *sound;
-	
-	for (sound = main->sound.first; sound; sound = sound->id.next) {
+	for (bSound *sound = main->sound.first; sound; sound = sound->id.next) {
 		if (sound->id.tag & LIB_TAG_NEED_LINK) {
-			sound->id.tag &= ~LIB_TAG_NEED_LINK;
+			IDP_LibLinkProperty(sound->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
+
 			sound->ipo = newlibadr_us(fd, sound->id.lib, sound->ipo); // XXX deprecated - old animation system
 			
 			BKE_sound_load(main, sound);
+
+			sound->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
 }
@@ -7431,17 +7488,13 @@ static void direct_link_group(FileData *fd, Group *group)
 
 static void lib_link_group(FileData *fd, Main *main)
 {
-	Group *group;
-	GroupObject *go;
-	bool add_us;
-	
-	for (group = main->group.first; group; group = group->id.next) {
+	for (Group *group = main->group.first; group; group = group->id.next) {
 		if (group->id.tag & LIB_TAG_NEED_LINK) {
-			group->id.tag &= ~LIB_TAG_NEED_LINK;
+			IDP_LibLinkProperty(group->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			
-			add_us = false;
+			bool add_us = false;
 			
-			for (go = group->gobject.first; go; go = go->next) {
+			for (GroupObject *go = group->gobject.first; go; go = go->next) {
 				go->ob = newlibadr_real_us(fd, group->id.lib, go->ob);
 				if (go->ob) {
 					go->ob->flag |= OB_FROMGROUP;
@@ -7453,6 +7506,8 @@ static void lib_link_group(FileData *fd, Main *main)
 				id_us_ensure_real(&group->id);
 			}
 			BKE_group_object_unlink(group, NULL, NULL, NULL);	/* removes NULL entries */
+
+			group->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
 	}
 }
@@ -7557,13 +7612,11 @@ static void lib_link_moviePlaneTracks(FileData *fd, MovieClip *clip, ListBase *t
 
 static void lib_link_movieclip(FileData *fd, Main *main)
 {
-	MovieClip *clip;
-	
-	for (clip = main->movieclip.first; clip; clip = clip->id.next) {
+	for (MovieClip *clip = main->movieclip.first; clip; clip = clip->id.next) {
 		if (clip->id.tag & LIB_TAG_NEED_LINK) {
 			MovieTracking *tracking = &clip->tracking;
-			MovieTrackingObject *object;
 
+			IDP_LibLinkProperty(clip->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &clip->id, clip->adt);
 			
 			clip->gpd = newlibadr_us(fd, clip->id.lib, clip->gpd);
@@ -7571,7 +7624,7 @@ static void lib_link_movieclip(FileData *fd, Main *main)
 			lib_link_movieTracks(fd, clip, &tracking->tracks);
 			lib_link_moviePlaneTracks(fd, clip, &tracking->plane_tracks);
 
-			for (object = tracking->objects.first; object; object = object->next) {
+			for (MovieTrackingObject *object = tracking->objects.first; object; object = object->next) {
 				lib_link_movieTracks(fd, clip, &object->tracks);
 				lib_link_moviePlaneTracks(fd, clip, &object->plane_tracks);
 			}
@@ -7648,16 +7701,12 @@ static void lib_link_mask_parent(FileData *fd, Mask *mask, MaskParent *parent)
 
 static void lib_link_mask(FileData *fd, Main *main)
 {
-	Mask *mask;
-
-	mask = main->mask.first;
-	while (mask) {
+	for (Mask *mask = main->mask.first; mask; mask = mask->id.next) {
 		if (mask->id.tag & LIB_TAG_NEED_LINK) {
-			MaskLayer *masklay;
-
+			IDP_LibLinkProperty(mask->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &mask->id, mask->adt);
 
-			for (masklay = mask->masklayers.first; masklay; masklay = masklay->next) {
+			for (MaskLayer *masklay = mask->masklayers.first; masklay; masklay = masklay->next) {
 				MaskSpline *spline;
 
 				spline = masklay->splines.first;
@@ -7678,7 +7727,6 @@ static void lib_link_mask(FileData *fd, Main *main)
 
 			mask->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
-		mask = mask->id.next;
 	}
 }
 
@@ -7686,18 +7734,13 @@ static void lib_link_mask(FileData *fd, Main *main)
 
 static void lib_link_linestyle(FileData *fd, Main *main)
 {
-	FreestyleLineStyle *linestyle;
-	LineStyleModifier *m;
-	MTex *mtex;
-	int a;
-
-	linestyle = main->linestyle.first;
-	while (linestyle) {
+	for (FreestyleLineStyle *linestyle = main->linestyle.first; linestyle; linestyle = linestyle->id.next) {
 		if (linestyle->id.tag & LIB_TAG_NEED_LINK) {
-			linestyle->id.tag &= ~LIB_TAG_NEED_LINK;
+			LineStyleModifier *m;
 
 			IDP_LibLinkProperty(linestyle->id.properties, (fd->flags & FD_FLAGS_SWITCH_ENDIAN), fd);
 			lib_link_animdata(fd, &linestyle->id, linestyle->adt);
+
 			for (m = linestyle->color_modifiers.first; m; m = m->next) {
 				switch (m->type) {
 				case LS_MODIFIER_DISTANCE_FROM_OBJECT:
@@ -7728,8 +7771,8 @@ static void lib_link_linestyle(FileData *fd, Main *main)
 					break;
 				}
 			}
-			for (a=0; a < MAX_MTEX; a++) {
-				mtex = linestyle->mtex[a];
+			for (int a = 0; a < MAX_MTEX; a++) {
+				MTex *mtex = linestyle->mtex[a];
 				if (mtex) {
 					mtex->tex = newlibadr_us(fd, linestyle->id.lib, mtex->tex);
 					mtex->object = newlibadr(fd, linestyle->id.lib, mtex->object);
@@ -7739,8 +7782,9 @@ static void lib_link_linestyle(FileData *fd, Main *main)
 				lib_link_ntree(fd, &linestyle->id, linestyle->nodetree);
 				linestyle->nodetree->id.lib = linestyle->id.lib;
 			}
+
+			linestyle->id.tag &= ~LIB_TAG_NEED_LINK;
 		}
-		linestyle = linestyle->id.next;
 	}
 }
 
@@ -8411,12 +8455,13 @@ static void lib_link_all(FileData *fd, Main *main)
 	lib_link_screen(fd, main);
 	lib_link_scene(fd, main);
 	lib_link_object(fd, main);
+	lib_link_mesh(fd, main);
 	lib_link_curve(fd, main);
 	lib_link_mball(fd, main);
 	lib_link_material(fd, main);
 	lib_link_texture(fd, main);
 	lib_link_image(fd, main);
-	lib_link_ipo(fd, main);		// XXX deprecated... still needs to be maintained for version patches still
+	lib_link_ipo(fd, main);        /* XXX deprecated... still needs to be maintained for version patches still */
 	lib_link_key(fd, main);
 	lib_link_world(fd, main);
 	lib_link_lamp(fd, main);
@@ -8429,7 +8474,7 @@ static void lib_link_all(FileData *fd, Main *main)
 	lib_link_armature(fd, main);
 	lib_link_action(fd, main);
 	lib_link_vfont(fd, main);
-	lib_link_nodetree(fd, main);	/* has to be done after scene/materials, this will verify group nodes */
+	lib_link_nodetree(fd, main);   /* has to be done after scene/materials, this will verify group nodes */
 	lib_link_brush(fd, main);
 	lib_link_palette(fd, main);
 	lib_link_paint_curve(fd, main);
@@ -8440,9 +8485,7 @@ static void lib_link_all(FileData *fd, Main *main)
 	lib_link_gpencil(fd, main);
 	lib_link_cachefiles(fd, main);
 
-	lib_link_mesh(fd, main);		/* as last: tpage images with users at zero */
-	
-	lib_link_library(fd, main);		/* only init users */
+	lib_link_library(fd, main);    /* only init users */
 }
 
 static void direct_link_keymapitem(FileData *fd, wmKeyMapItem *kmi)
@@ -8566,7 +8609,12 @@ BlendFileData *blo_read_file_internal(FileData *fd, const char *filepath)
 			bhead = read_global(bfd, fd, bhead);
 			break;
 		case USER:
-			bhead = read_userdef(bfd, fd, bhead);
+			if (fd->skip_flags & BLO_READ_SKIP_USERDEF) {
+				bhead = blo_nextbhead(fd, bhead);
+			}
+			else {
+				bhead = read_userdef(bfd, fd, bhead);
+			}
 			break;
 		case ENDB:
 			bhead = NULL;
@@ -8575,15 +8623,24 @@ BlendFileData *blo_read_file_internal(FileData *fd, const char *filepath)
 		case ID_ID:
 			/* Always adds to the most recently loaded ID_LI block, see direct_link_library.
 			 * This is part of the file format definition. */
-			bhead = read_libblock(fd, mainlist.last, bhead, LIB_TAG_READ | LIB_TAG_EXTERN, NULL);
+			if (fd->skip_flags & BLO_READ_SKIP_DATA) {
+				bhead = blo_nextbhead(fd, bhead);
+			}
+			else {
+				bhead = read_libblock(fd, mainlist.last, bhead, LIB_TAG_READ | LIB_TAG_EXTERN, NULL);
+			}
 			break;
-			
 			/* in 2.50+ files, the file identifier for screens is patched, forward compatibility */
 		case ID_SCRN:
 			bhead->code = ID_SCR;
 			/* deliberate pass on to default */
 		default:
-			bhead = read_libblock(fd, bfd->main, bhead, LIB_TAG_LOCAL, NULL);
+			if (fd->skip_flags & BLO_READ_SKIP_DATA) {
+				bhead = blo_nextbhead(fd, bhead);
+			}
+			else {
+				bhead = read_libblock(fd, bfd->main, bhead, LIB_TAG_LOCAL, NULL);
+			}
 		}
 	}
 	
@@ -10395,6 +10452,9 @@ static void read_libraries(FileData *basefd, ListBase *mainlist)
 					else {
 						mainptr->curlib->filedata = NULL;
 						mainptr->curlib->id.tag |= LIB_TAG_MISSING;
+						/* Set lib version to current main one... Makes assert later happy. */
+						mainptr->versionfile = mainptr->curlib->versionfile = mainl->versionfile;
+						mainptr->subversionfile = mainptr->curlib->subversionfile = mainl->subversionfile;
 					}
 					
 					if (fd == NULL) {
diff --git a/source/blender/blenloader/intern/readfile.h b/source/blender/blenloader/intern/readfile.h
index d97bef13a78..62ce15a640e 100644
--- a/source/blender/blenloader/intern/readfile.h
+++ b/source/blender/blenloader/intern/readfile.h
@@ -81,6 +81,8 @@ typedef struct FileData {
 	int id_name_offs;       /* used to retrieve ID names from (bhead+1) */
 	int globalf, fileflags; /* for do_versions patching */
 	
+	eBLOReadSkip skip_flags;  /* skip some data-blocks */
+
 	struct OldNewMap *datamap;
 	struct OldNewMap *globmap;
 	struct OldNewMap *libmap;
diff --git a/source/blender/blenloader/intern/versioning_270.c b/source/blender/blenloader/intern/versioning_270.c
index 6235ebf28ee..d3f33cf725f 100644
--- a/source/blender/blenloader/intern/versioning_270.c
+++ b/source/blender/blenloader/intern/versioning_270.c
@@ -1575,6 +1575,40 @@ void blo_do_versions_270(FileData *fd, Library *UNUSED(lib), Main *main)
 			}
 		}
 
+		/* Fix for T50736, Glare comp node using same var for two different things. */
+		if (!DNA_struct_elem_find(fd->filesdna, "NodeGlare", "char", "star_45")) {
+			FOREACH_NODETREE(main, ntree, id) {
+				if (ntree->type == NTREE_COMPOSIT) {
+					ntreeSetTypes(NULL, ntree);
+					for (bNode *node = ntree->nodes.first; node; node = node->next) {
+						if (node->type == CMP_NODE_GLARE) {
+							NodeGlare *ndg = node->storage;
+							switch (ndg->type) {
+								case 2:  /* Grrrr! magic numbers :( */
+									ndg->streaks = ndg->angle;
+									break;
+								case 0:
+									ndg->star_45 = ndg->angle != 0;
+									break;
+								default:
+									break;
+							}
+						}
+					}
+				}
+			} FOREACH_NODETREE_END
+		}
+
+		if (!DNA_struct_elem_find(fd->filesdna, "SurfaceDeformModifierData", "float", "mat[4][4]")) {
+			for (Object *ob = main->object.first; ob; ob = ob->id.next) {
+				for (ModifierData *md = ob->modifiers.first; md; md = md->next) {
+					if (md->type == eModifierType_SurfaceDeform) {
+						SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
+						unit_m4(smd->mat);
+					}
+				}
+			}
+		}
 	}
 }
 
diff --git a/source/blender/blenloader/intern/versioning_defaults.c b/source/blender/blenloader/intern/versioning_defaults.c
index 99d9e140481..e34f12b1cf9 100644
--- a/source/blender/blenloader/intern/versioning_defaults.c
+++ b/source/blender/blenloader/intern/versioning_defaults.c
@@ -68,6 +68,18 @@ void BLO_update_defaults_userpref_blend(void)
 	 * but take care since some hardware has driver bugs here (T46962).
 	 * Further hardware workarounds should be made in gpu_extensions.c */
 	U.glalphaclip = (1.0f / 255);
+
+	/* default so DPI is detected automatically */
+	U.dpi = 0;
+	U.ui_scale = 1.0f;
+
+#ifdef WITH_PYTHON_SECURITY
+	/* use alternative setting for security nuts
+	 * otherwise we'd need to patch the binary blob - startup.blend.c */
+	U.flag |= USER_SCRIPT_AUTOEXEC_DISABLE;
+#else
+	U.flag &= ~USER_SCRIPT_AUTOEXEC_DISABLE;
+#endif
 }
 
 /**
diff --git a/source/blender/blenloader/intern/writefile.c b/source/blender/blenloader/intern/writefile.c
index 7b8b95f0005..57be237be6f 100644
--- a/source/blender/blenloader/intern/writefile.c
+++ b/source/blender/blenloader/intern/writefile.c
@@ -78,7 +78,7 @@
  * - write #TEST (#RenderInfo struct. 128x128 blend file preview is optional).
  * - write #GLOB (#FileGlobal struct) (some global vars).
  * - write #DNA1 (#SDNA struct)
- * - write #USER (#UserDef struct) if filename is ``~/X.XX/config/startup.blend``.
+ * - write #USER (#UserDef struct) if filename is ``~/.config/blender/X.XX/config/startup.blend``.
  */
 
 
@@ -797,30 +797,22 @@ static void write_fcurves(WriteData *wd, ListBase *fcurves)
 	}
 }
 
-static void write_actions(WriteData *wd, ListBase *idbase)
+static void write_action(WriteData *wd, bAction *act)
 {
-	bAction *act;
-	bActionGroup *grp;
-	TimeMarker *marker;
-
-	for (act = idbase->first; act; act = act->id.next) {
-		if (act->id.us > 0 || wd->current) {
-			writestruct(wd, ID_AC, bAction, 1, act);
-			write_iddata(wd, &act->id);
+	if (act->id.us > 0 || wd->current) {
+		writestruct(wd, ID_AC, bAction, 1, act);
+		write_iddata(wd, &act->id);
 
-			write_fcurves(wd, &act->curves);
+		write_fcurves(wd, &act->curves);
 
-			for (grp = act->groups.first; grp; grp = grp->next) {
-				writestruct(wd, DATA, bActionGroup, 1, grp);
-			}
+		for (bActionGroup *grp = act->groups.first; grp; grp = grp->next) {
+			writestruct(wd, DATA, bActionGroup, 1, grp);
+		}
 
-			for (marker = act->markers.first; marker; marker = marker->next) {
-				writestruct(wd, DATA, TimeMarker, 1, marker);
-			}
+		for (TimeMarker *marker = act->markers.first; marker; marker = marker->next) {
+			writestruct(wd, DATA, TimeMarker, 1, marker);
 		}
 	}
-
-	mywrite_flush(wd);
 }
 
 static void write_keyingsets(WriteData *wd, ListBase *list)
@@ -969,7 +961,7 @@ static void write_node_socket_interface(WriteData *wd, bNodeTree *UNUSED(ntree),
 	}
 }
 /* this is only direct data, tree itself should have been written */
-static void write_nodetree(WriteData *wd, bNodeTree *ntree)
+static void write_nodetree_nolib(WriteData *wd, bNodeTree *ntree)
 {
 	bNode *node;
 	bNodeSocket *sock;
@@ -1030,6 +1022,25 @@ static void write_nodetree(WriteData *wd, bNodeTree *ntree)
 			{
 				/* pass */
 			}
+			else if ((ntree->type == NTREE_COMPOSIT) && (node->type == CMP_NODE_GLARE)) {
+				/* Simple forward compat for fix for T50736.
+				 * Not ideal (there is no ideal solution here), but should do for now. */
+				NodeGlare *ndg = node->storage;
+				/* Not in undo case. */
+				if (!wd->current) {
+					switch (ndg->type) {
+						case 2:  /* Grrrr! magic numbers :( */
+							ndg->angle = ndg->streaks;
+							break;
+						case 0:
+							ndg->angle = ndg->star_45;
+							break;
+						default:
+							break;
+					}
+				}
+				writestruct_id(wd, DATA, node->typeinfo->storagename, 1, node->storage);
+			}
 			else {
 				writestruct_id(wd, DATA, node->typeinfo->storagename, 1, node->storage);
 			}
@@ -1278,68 +1289,60 @@ static void write_pointcaches(WriteData *wd, ListBase *ptcaches)
 		}
 	}
 }
-static void write_particlesettings(WriteData *wd, ListBase *idbase)
-{
-	ParticleSettings *part;
-	ParticleDupliWeight *dw;
-	GroupObject *go;
-	int a;
 
-	part = idbase->first;
-	while (part) {
-		if (part->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_PA, ParticleSettings, 1, part);
-			write_iddata(wd, &part->id);
+static void write_particlesettings(WriteData *wd, ParticleSettings *part)
+{
+	if (part->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_PA, ParticleSettings, 1, part);
+		write_iddata(wd, &part->id);
 
-			if (part->adt) {
-				write_animdata(wd, part->adt);
-			}
-			writestruct(wd, DATA, PartDeflect, 1, part->pd);
-			writestruct(wd, DATA, PartDeflect, 1, part->pd2);
-			writestruct(wd, DATA, EffectorWeights, 1, part->effector_weights);
+		if (part->adt) {
+			write_animdata(wd, part->adt);
+		}
+		writestruct(wd, DATA, PartDeflect, 1, part->pd);
+		writestruct(wd, DATA, PartDeflect, 1, part->pd2);
+		writestruct(wd, DATA, EffectorWeights, 1, part->effector_weights);
 
-			if (part->clumpcurve) {
-				write_curvemapping(wd, part->clumpcurve);
-			}
-			if (part->roughcurve) {
-				write_curvemapping(wd, part->roughcurve);
-			}
+		if (part->clumpcurve) {
+			write_curvemapping(wd, part->clumpcurve);
+		}
+		if (part->roughcurve) {
+			write_curvemapping(wd, part->roughcurve);
+		}
 
-			dw = part->dupliweights.first;
-			for (; dw; dw = dw->next) {
-				/* update indices, but only if dw->ob is set (can be NULL after loading e.g.) */
-				if (dw->ob != NULL) {
-					dw->index = 0;
-					if (part->dup_group) { /* can be NULL if lining fails or set to None */
-						for (go = part->dup_group->gobject.first; go && go->ob != dw->ob; go = go->next, dw->index++);
-					}
+		for (ParticleDupliWeight *dw = part->dupliweights.first; dw; dw = dw->next) {
+			/* update indices, but only if dw->ob is set (can be NULL after loading e.g.) */
+			if (dw->ob != NULL) {
+				dw->index = 0;
+				if (part->dup_group) { /* can be NULL if lining fails or set to None */
+					for (GroupObject *go = part->dup_group->gobject.first;
+						 go && go->ob != dw->ob;
+						 go = go->next, dw->index++);
 				}
-				writestruct(wd, DATA, ParticleDupliWeight, 1, dw);
 			}
+			writestruct(wd, DATA, ParticleDupliWeight, 1, dw);
+		}
 
-			if (part->boids && part->phystype == PART_PHYS_BOIDS) {
-				BoidState *state = part->boids->states.first;
-
-				writestruct(wd, DATA, BoidSettings, 1, part->boids);
+		if (part->boids && part->phystype == PART_PHYS_BOIDS) {
+			writestruct(wd, DATA, BoidSettings, 1, part->boids);
 
-				for (; state; state = state->next) {
-					write_boid_state(wd, state);
-				}
-			}
-			if (part->fluid && part->phystype == PART_PHYS_FLUID) {
-				writestruct(wd, DATA, SPHFluidSettings, 1, part->fluid);
+			for (BoidState *state = part->boids->states.first; state; state = state->next) {
+				write_boid_state(wd, state);
 			}
+		}
+		if (part->fluid && part->phystype == PART_PHYS_FLUID) {
+			writestruct(wd, DATA, SPHFluidSettings, 1, part->fluid);
+		}
 
-			for (a = 0; a < MAX_MTEX; a++) {
-				if (part->mtex[a]) {
-					writestruct(wd, DATA, MTex, 1, part->mtex[a]);
-				}
+		for (int a = 0; a < MAX_MTEX; a++) {
+			if (part->mtex[a]) {
+				writestruct(wd, DATA, MTex, 1, part->mtex[a]);
 			}
 		}
-		part = part->id.next;
 	}
 }
+
 static void write_particlesystems(WriteData *wd, ListBase *particles)
 {
 	ParticleSystem *psys = particles->first;
@@ -1830,242 +1833,207 @@ static void write_modifiers(WriteData *wd, ListBase *modbase)
 				writedata(wd, DATA, sizeof(float[3]) * csmd->bind_coords_num, csmd->bind_coords);
 			}
 		}
-	}
-}
+		else if (md->type == eModifierType_SurfaceDeform) {
+			SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
 
-static void write_objects(WriteData *wd, ListBase *idbase)
-{
-	Object *ob;
+			writestruct(wd, DATA, SDefVert, smd->numverts, smd->verts);
 
-	ob = idbase->first;
-	while (ob) {
-		if (ob->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_OB, Object, 1, ob);
-			write_iddata(wd, &ob->id);
+			if (smd->verts) {
+				for (int i = 0; i < smd->numverts; i++) {
+					writestruct(wd, DATA, SDefBind, smd->verts[i].numbinds, smd->verts[i].binds);
 
-			if (ob->adt) {
-				write_animdata(wd, ob->adt);
-			}
+					if (smd->verts[i].binds) {
+						for (int j = 0; j < smd->verts[i].numbinds; j++) {
+							writedata(wd, DATA, sizeof(int) * smd->verts[i].binds[j].numverts, smd->verts[i].binds[j].vert_inds);
 
-			/* direct data */
-			writedata(wd, DATA, sizeof(void *) * ob->totcol, ob->mat);
-			writedata(wd, DATA, sizeof(char) * ob->totcol, ob->matbits);
-			/* write_effects(wd, &ob->effect); */ /* not used anymore */
-			write_properties(wd, &ob->prop);
-			write_sensors(wd, &ob->sensors);
-			write_controllers(wd, &ob->controllers);
-			write_actuators(wd, &ob->actuators);
-
-			if (ob->type == OB_ARMATURE) {
-				bArmature *arm = ob->data;
-				if (arm && ob->pose && arm->act_bone) {
-					BLI_strncpy(ob->pose->proxy_act_bone, arm->act_bone->name, sizeof(ob->pose->proxy_act_bone));
+							if (smd->verts[i].binds[j].mode == MOD_SDEF_MODE_CENTROID ||
+							    smd->verts[i].binds[j].mode == MOD_SDEF_MODE_LOOPTRI)
+							{
+								writedata(wd, DATA, sizeof(float) * 3, smd->verts[i].binds[j].vert_weights);
+							}
+							else {
+								writedata(wd, DATA, sizeof(float) * smd->verts[i].binds[j].numverts, smd->verts[i].binds[j].vert_weights);
+							}
+						}
+					}
 				}
 			}
+		}
+	}
+}
 
-			write_pose(wd, ob->pose);
-			write_defgroups(wd, &ob->defbase);
-			write_constraints(wd, &ob->constraints);
-			write_motionpath(wd, ob->mpath);
+static void write_object(WriteData *wd, Object *ob)
+{
+	if (ob->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_OB, Object, 1, ob);
+		write_iddata(wd, &ob->id);
 
-			writestruct(wd, DATA, PartDeflect, 1, ob->pd);
-			writestruct(wd, DATA, SoftBody, 1, ob->soft);
-			if (ob->soft) {
-				write_pointcaches(wd, &ob->soft->ptcaches);
-				writestruct(wd, DATA, EffectorWeights, 1, ob->soft->effector_weights);
-			}
-			writestruct(wd, DATA, BulletSoftBody, 1, ob->bsoft);
+		if (ob->adt) {
+			write_animdata(wd, ob->adt);
+		}
 
-			if (ob->rigidbody_object) {
-				/* TODO: if any extra data is added to handle duplis, will need separate function then */
-				writestruct(wd, DATA, RigidBodyOb, 1, ob->rigidbody_object);
-			}
-			if (ob->rigidbody_constraint) {
-				writestruct(wd, DATA, RigidBodyCon, 1, ob->rigidbody_constraint);
-			}
+		/* direct data */
+		writedata(wd, DATA, sizeof(void *) * ob->totcol, ob->mat);
+		writedata(wd, DATA, sizeof(char) * ob->totcol, ob->matbits);
+		/* write_effects(wd, &ob->effect); */ /* not used anymore */
+		write_properties(wd, &ob->prop);
+		write_sensors(wd, &ob->sensors);
+		write_controllers(wd, &ob->controllers);
+		write_actuators(wd, &ob->actuators);
 
-			if (ob->type == OB_EMPTY && ob->empty_drawtype == OB_EMPTY_IMAGE) {
-				writestruct(wd, DATA, ImageUser, 1, ob->iuser);
+		if (ob->type == OB_ARMATURE) {
+			bArmature *arm = ob->data;
+			if (arm && ob->pose && arm->act_bone) {
+				BLI_strncpy(ob->pose->proxy_act_bone, arm->act_bone->name, sizeof(ob->pose->proxy_act_bone));
 			}
+		}
 
-			write_particlesystems(wd, &ob->particlesystem);
-			write_modifiers(wd, &ob->modifiers);
+		write_pose(wd, ob->pose);
+		write_defgroups(wd, &ob->defbase);
+		write_constraints(wd, &ob->constraints);
+		write_motionpath(wd, ob->mpath);
 
-			writelist(wd, DATA, LinkData, &ob->pc_ids);
-			writelist(wd, DATA, LodLevel, &ob->lodlevels);
+		writestruct(wd, DATA, PartDeflect, 1, ob->pd);
+		writestruct(wd, DATA, SoftBody, 1, ob->soft);
+		if (ob->soft) {
+			write_pointcaches(wd, &ob->soft->ptcaches);
+			writestruct(wd, DATA, EffectorWeights, 1, ob->soft->effector_weights);
 		}
+		writestruct(wd, DATA, BulletSoftBody, 1, ob->bsoft);
 
-		write_previews(wd, ob->preview);
+		if (ob->rigidbody_object) {
+			/* TODO: if any extra data is added to handle duplis, will need separate function then */
+			writestruct(wd, DATA, RigidBodyOb, 1, ob->rigidbody_object);
+		}
+		if (ob->rigidbody_constraint) {
+			writestruct(wd, DATA, RigidBodyCon, 1, ob->rigidbody_constraint);
+		}
 
-		ob = ob->id.next;
-	}
+		if (ob->type == OB_EMPTY && ob->empty_drawtype == OB_EMPTY_IMAGE) {
+			writestruct(wd, DATA, ImageUser, 1, ob->iuser);
+		}
 
-	mywrite_flush(wd);
-}
+		write_particlesystems(wd, &ob->particlesystem);
+		write_modifiers(wd, &ob->modifiers);
 
+		writelist(wd, DATA, LinkData, &ob->pc_ids);
+		writelist(wd, DATA, LodLevel, &ob->lodlevels);
 
-static void write_vfonts(WriteData *wd, ListBase *idbase)
-{
-	VFont *vf;
-	PackedFile *pf;
+		write_previews(wd, ob->preview);
+	}
+}
 
-	vf = idbase->first;
-	while (vf) {
-		if (vf->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_VF, VFont, 1, vf);
-			write_iddata(wd, &vf->id);
 
-			/* direct data */
+static void write_vfont(WriteData *wd, VFont *vf)
+{
+	if (vf->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_VF, VFont, 1, vf);
+		write_iddata(wd, &vf->id);
 
-			if (vf->packedfile) {
-				pf = vf->packedfile;
-				writestruct(wd, DATA, PackedFile, 1, pf);
-				writedata(wd, DATA, pf->size, pf->data);
-			}
+		/* direct data */
+		if (vf->packedfile) {
+			PackedFile *pf = vf->packedfile;
+			writestruct(wd, DATA, PackedFile, 1, pf);
+			writedata(wd, DATA, pf->size, pf->data);
 		}
-
-		vf = vf->id.next;
 	}
-
-	mywrite_flush(wd);
 }
 
 
-static void write_keys(WriteData *wd, ListBase *idbase)
+static void write_key(WriteData *wd, Key *key)
 {
-	Key *key;
-	KeyBlock *kb;
-
-	key = idbase->first;
-	while (key) {
-		if (key->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_KE, Key, 1, key);
-			write_iddata(wd, &key->id);
-
-			if (key->adt) {
-				write_animdata(wd, key->adt);
-			}
+	if (key->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_KE, Key, 1, key);
+		write_iddata(wd, &key->id);
 
-			/* direct data */
-			kb = key->block.first;
-			while (kb) {
-				writestruct(wd, DATA, KeyBlock, 1, kb);
-				if (kb->data) {
-					writedata(wd, DATA, kb->totelem * key->elemsize, kb->data);
-				}
-				kb = kb->next;
-			}
+		if (key->adt) {
+			write_animdata(wd, key->adt);
 		}
 
-		key = key->id.next;
+		/* direct data */
+		for (KeyBlock *kb = key->block.first; kb; kb = kb->next) {
+			writestruct(wd, DATA, KeyBlock, 1, kb);
+			if (kb->data) {
+				writedata(wd, DATA, kb->totelem * key->elemsize, kb->data);
+			}
+		}
 	}
-
-	mywrite_flush(wd);
 }
 
-static void write_cameras(WriteData *wd, ListBase *idbase)
+static void write_camera(WriteData *wd, Camera *cam)
 {
-	Camera *cam;
-
-	cam = idbase->first;
-	while (cam) {
-		if (cam->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_CA, Camera, 1, cam);
-			write_iddata(wd, &cam->id);
+	if (cam->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_CA, Camera, 1, cam);
+		write_iddata(wd, &cam->id);
 
-			if (cam->adt) {
-				write_animdata(wd, cam->adt);
-			}
+		if (cam->adt) {
+			write_animdata(wd, cam->adt);
 		}
-
-		cam = cam->id.next;
 	}
 }
 
-static void write_mballs(WriteData *wd, ListBase *idbase)
+static void write_mball(WriteData *wd, MetaBall *mb)
 {
-	MetaBall *mb;
-	MetaElem *ml;
-
-	mb = idbase->first;
-	while (mb) {
-		if (mb->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_MB, MetaBall, 1, mb);
-			write_iddata(wd, &mb->id);
+	if (mb->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_MB, MetaBall, 1, mb);
+		write_iddata(wd, &mb->id);
 
-			/* direct data */
-			writedata(wd, DATA, sizeof(void *) * mb->totcol, mb->mat);
-			if (mb->adt) {
-				write_animdata(wd, mb->adt);
-			}
+		/* direct data */
+		writedata(wd, DATA, sizeof(void *) * mb->totcol, mb->mat);
+		if (mb->adt) {
+			write_animdata(wd, mb->adt);
+		}
 
-			ml = mb->elems.first;
-			while (ml) {
-				writestruct(wd, DATA, MetaElem, 1, ml);
-				ml = ml->next;
-			}
+		for (MetaElem *ml = mb->elems.first; ml; ml = ml->next) {
+			writestruct(wd, DATA, MetaElem, 1, ml);
 		}
-		mb = mb->id.next;
 	}
 }
 
-static void write_curves(WriteData *wd, ListBase *idbase)
+static void write_curve(WriteData *wd, Curve *cu)
 {
-	Curve *cu;
-	Nurb *nu;
-
-	cu = idbase->first;
-	while (cu) {
-		if (cu->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_CU, Curve, 1, cu);
-			write_iddata(wd, &cu->id);
+	if (cu->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_CU, Curve, 1, cu);
+		write_iddata(wd, &cu->id);
 
-			/* direct data */
-			writedata(wd, DATA, sizeof(void *) * cu->totcol, cu->mat);
-			if (cu->adt) {
-				write_animdata(wd, cu->adt);
-			}
+		/* direct data */
+		writedata(wd, DATA, sizeof(void *) * cu->totcol, cu->mat);
+		if (cu->adt) {
+			write_animdata(wd, cu->adt);
+		}
 
-			if (cu->vfont) {
-				writedata(wd, DATA, cu->len + 1, cu->str);
-				writestruct(wd, DATA, CharInfo, cu->len_wchar + 1, cu->strinfo);
-				writestruct(wd, DATA, TextBox, cu->totbox, cu->tb);
+		if (cu->vfont) {
+			writedata(wd, DATA, cu->len + 1, cu->str);
+			writestruct(wd, DATA, CharInfo, cu->len_wchar + 1, cu->strinfo);
+			writestruct(wd, DATA, TextBox, cu->totbox, cu->tb);
+		}
+		else {
+			/* is also the order of reading */
+			for (Nurb *nu = cu->nurb.first; nu; nu = nu->next) {
+				writestruct(wd, DATA, Nurb, 1, nu);
 			}
-			else {
-				/* is also the order of reading */
-				nu = cu->nurb.first;
-				while (nu) {
-					writestruct(wd, DATA, Nurb, 1, nu);
-					nu = nu->next;
+			for (Nurb *nu = cu->nurb.first; nu; nu = nu->next) {
+				if (nu->type == CU_BEZIER) {
+					writestruct(wd, DATA, BezTriple, nu->pntsu, nu->bezt);
 				}
-				nu = cu->nurb.first;
-				while (nu) {
-					if (nu->type == CU_BEZIER) {
-						writestruct(wd, DATA, BezTriple, nu->pntsu, nu->bezt);
+				else {
+					writestruct(wd, DATA, BPoint, nu->pntsu * nu->pntsv, nu->bp);
+					if (nu->knotsu) {
+						writedata(wd, DATA, KNOTSU(nu) * sizeof(float), nu->knotsu);
 					}
-					else {
-						writestruct(wd, DATA, BPoint, nu->pntsu * nu->pntsv, nu->bp);
-						if (nu->knotsu) {
-							writedata(wd, DATA, KNOTSU(nu) * sizeof(float), nu->knotsu);
-						}
-						if (nu->knotsv) {
-							writedata(wd, DATA, KNOTSV(nu) * sizeof(float), nu->knotsv);
-						}
+					if (nu->knotsv) {
+						writedata(wd, DATA, KNOTSV(nu) * sizeof(float), nu->knotsv);
 					}
-					nu = nu->next;
 				}
 			}
 		}
-		cu = cu->id.next;
 	}
-
-	mywrite_flush(wd);
 }
 
 static void write_dverts(WriteData *wd, int count, MDeformVert *dvlist)
@@ -2182,407 +2150,349 @@ static void write_customdata(
 	}
 }
 
-static void write_meshes(WriteData *wd, ListBase *idbase)
+static void write_mesh(WriteData *wd, Mesh *mesh)
 {
-	Mesh *mesh;
-	bool save_for_old_blender = false;
-
 #ifdef USE_BMESH_SAVE_AS_COMPAT
-	save_for_old_blender = wd->use_mesh_compat; /* option to save with older mesh format */
+	const bool save_for_old_blender = wd->use_mesh_compat;  /* option to save with older mesh format */
+#else
+	const bool save_for_old_blender = false;
 #endif
 
-	mesh = idbase->first;
-	while (mesh) {
-		CustomDataLayer *vlayers = NULL, vlayers_buff[CD_TEMP_CHUNK_SIZE];
-		CustomDataLayer *elayers = NULL, elayers_buff[CD_TEMP_CHUNK_SIZE];
-		CustomDataLayer *flayers = NULL, flayers_buff[CD_TEMP_CHUNK_SIZE];
-		CustomDataLayer *llayers = NULL, llayers_buff[CD_TEMP_CHUNK_SIZE];
-		CustomDataLayer *players = NULL, players_buff[CD_TEMP_CHUNK_SIZE];
-
-		if (mesh->id.us > 0 || wd->current) {
-			/* write LibData */
-			if (!save_for_old_blender) {
-				/* write a copy of the mesh, don't modify in place because it is
-				 * not thread safe for threaded renders that are reading this */
-				Mesh *old_mesh = mesh;
-				Mesh copy_mesh = *mesh;
-				mesh = &copy_mesh;
+	CustomDataLayer *vlayers = NULL, vlayers_buff[CD_TEMP_CHUNK_SIZE];
+	CustomDataLayer *elayers = NULL, elayers_buff[CD_TEMP_CHUNK_SIZE];
+	CustomDataLayer *flayers = NULL, flayers_buff[CD_TEMP_CHUNK_SIZE];
+	CustomDataLayer *llayers = NULL, llayers_buff[CD_TEMP_CHUNK_SIZE];
+	CustomDataLayer *players = NULL, players_buff[CD_TEMP_CHUNK_SIZE];
+
+	if (mesh->id.us > 0 || wd->current) {
+		/* write LibData */
+		if (!save_for_old_blender) {
+			/* write a copy of the mesh, don't modify in place because it is
+			 * not thread safe for threaded renders that are reading this */
+			Mesh *old_mesh = mesh;
+			Mesh copy_mesh = *mesh;
+			mesh = &copy_mesh;
 
 #ifdef USE_BMESH_SAVE_WITHOUT_MFACE
-				/* cache only - don't write */
-				mesh->mface = NULL;
-				mesh->totface = 0;
-				memset(&mesh->fdata, 0, sizeof(mesh->fdata));
+			/* cache only - don't write */
+			mesh->mface = NULL;
+			mesh->totface = 0;
+			memset(&mesh->fdata, 0, sizeof(mesh->fdata));
 #endif /* USE_BMESH_SAVE_WITHOUT_MFACE */
 
-				/**
-				 * Those calls:
-				 *   - Reduce mesh->xdata.totlayer to number of layers to write.
-				 *   - Fill xlayers with those layers to be written.
-				 * Note that mesh->xdata is from now on invalid for Blender, but this is why the whole mesh is
-				 * a temp local copy!
-				 */
-				CustomData_file_write_prepare(&mesh->vdata, &vlayers, vlayers_buff, ARRAY_SIZE(vlayers_buff));
-				CustomData_file_write_prepare(&mesh->edata, &elayers, elayers_buff, ARRAY_SIZE(elayers_buff));
+			/**
+			 * Those calls:
+			 *   - Reduce mesh->xdata.totlayer to number of layers to write.
+			 *   - Fill xlayers with those layers to be written.
+			 * Note that mesh->xdata is from now on invalid for Blender, but this is why the whole mesh is
+			 * a temp local copy!
+			 */
+			CustomData_file_write_prepare(&mesh->vdata, &vlayers, vlayers_buff, ARRAY_SIZE(vlayers_buff));
+			CustomData_file_write_prepare(&mesh->edata, &elayers, elayers_buff, ARRAY_SIZE(elayers_buff));
 #ifndef USE_BMESH_SAVE_WITHOUT_MFACE  /* Do not copy org fdata in this case!!! */
-				CustomData_file_write_prepare(&mesh->fdata, &flayers, flayers_buff, ARRAY_SIZE(flayers_buff));
+			CustomData_file_write_prepare(&mesh->fdata, &flayers, flayers_buff, ARRAY_SIZE(flayers_buff));
 #else
-				flayers = flayers_buff;
+			flayers = flayers_buff;
 #endif
-				CustomData_file_write_prepare(&mesh->ldata, &llayers, llayers_buff, ARRAY_SIZE(llayers_buff));
-				CustomData_file_write_prepare(&mesh->pdata, &players, players_buff, ARRAY_SIZE(players_buff));
+			CustomData_file_write_prepare(&mesh->ldata, &llayers, llayers_buff, ARRAY_SIZE(llayers_buff));
+			CustomData_file_write_prepare(&mesh->pdata, &players, players_buff, ARRAY_SIZE(players_buff));
 
-				writestruct_at_address(wd, ID_ME, Mesh, 1, old_mesh, mesh);
-				write_iddata(wd, &mesh->id);
+			writestruct_at_address(wd, ID_ME, Mesh, 1, old_mesh, mesh);
+			write_iddata(wd, &mesh->id);
 
-				/* direct data */
-				if (mesh->adt) {
-					write_animdata(wd, mesh->adt);
-				}
+			/* direct data */
+			if (mesh->adt) {
+				write_animdata(wd, mesh->adt);
+			}
 
-				writedata(wd, DATA, sizeof(void *) * mesh->totcol, mesh->mat);
-				writedata(wd, DATA, sizeof(MSelect) * mesh->totselect, mesh->mselect);
+			writedata(wd, DATA, sizeof(void *) * mesh->totcol, mesh->mat);
+			writedata(wd, DATA, sizeof(MSelect) * mesh->totselect, mesh->mselect);
 
-				write_customdata(wd, &mesh->id, mesh->totvert, &mesh->vdata, vlayers, -1, 0);
-				write_customdata(wd, &mesh->id, mesh->totedge, &mesh->edata, elayers, -1, 0);
-				/* fdata is really a dummy - written so slots align */
-				write_customdata(wd, &mesh->id, mesh->totface, &mesh->fdata, flayers, -1, 0);
-				write_customdata(wd, &mesh->id, mesh->totloop, &mesh->ldata, llayers, -1, 0);
-				write_customdata(wd, &mesh->id, mesh->totpoly, &mesh->pdata, players, -1, 0);
+			write_customdata(wd, &mesh->id, mesh->totvert, &mesh->vdata, vlayers, -1, 0);
+			write_customdata(wd, &mesh->id, mesh->totedge, &mesh->edata, elayers, -1, 0);
+			/* fdata is really a dummy - written so slots align */
+			write_customdata(wd, &mesh->id, mesh->totface, &mesh->fdata, flayers, -1, 0);
+			write_customdata(wd, &mesh->id, mesh->totloop, &mesh->ldata, llayers, -1, 0);
+			write_customdata(wd, &mesh->id, mesh->totpoly, &mesh->pdata, players, -1, 0);
 
-				/* restore pointer */
-				mesh = old_mesh;
-			}
-			else {
+			/* restore pointer */
+			mesh = old_mesh;
+		}
+		else {
 
 #ifdef USE_BMESH_SAVE_AS_COMPAT
-				/* write a copy of the mesh, don't modify in place because it is
-				 * not thread safe for threaded renders that are reading this */
-				Mesh *old_mesh = mesh;
-				Mesh copy_mesh = *mesh;
-				mesh = &copy_mesh;
-
-				mesh->mpoly = NULL;
-				mesh->mface = NULL;
-				mesh->totface = 0;
-				mesh->totpoly = 0;
-				mesh->totloop = 0;
-				CustomData_reset(&mesh->fdata);
-				CustomData_reset(&mesh->pdata);
-				CustomData_reset(&mesh->ldata);
-				mesh->edit_btmesh = NULL;
-
-				/* now fill in polys to mfaces */
-				/* XXX This breaks writing design, by using temp allocated memory, which will likely generate
-				 *     duplicates in stored 'old' addresses.
-				 *     This is very bad, but do not see easy way to avoid this, aside from generating those data
-				 *     outside of save process itself.
-				 *     Maybe we can live with this, though?
-				 */
-				mesh->totface = BKE_mesh_mpoly_to_mface(&mesh->fdata, &old_mesh->ldata, &old_mesh->pdata,
-				                                        mesh->totface, old_mesh->totloop, old_mesh->totpoly);
+			/* write a copy of the mesh, don't modify in place because it is
+			 * not thread safe for threaded renders that are reading this */
+			Mesh *old_mesh = mesh;
+			Mesh copy_mesh = *mesh;
+			mesh = &copy_mesh;
+
+			mesh->mpoly = NULL;
+			mesh->mface = NULL;
+			mesh->totface = 0;
+			mesh->totpoly = 0;
+			mesh->totloop = 0;
+			CustomData_reset(&mesh->fdata);
+			CustomData_reset(&mesh->pdata);
+			CustomData_reset(&mesh->ldata);
+			mesh->edit_btmesh = NULL;
+
+			/* now fill in polys to mfaces */
+			/* XXX This breaks writing design, by using temp allocated memory, which will likely generate
+			 *     duplicates in stored 'old' addresses.
+			 *     This is very bad, but do not see easy way to avoid this, aside from generating those data
+			 *     outside of save process itself.
+			 *     Maybe we can live with this, though?
+			 */
+			mesh->totface = BKE_mesh_mpoly_to_mface(&mesh->fdata, &old_mesh->ldata, &old_mesh->pdata,
+													mesh->totface, old_mesh->totloop, old_mesh->totpoly);
 
-				BKE_mesh_update_customdata_pointers(mesh, false);
+			BKE_mesh_update_customdata_pointers(mesh, false);
 
-				CustomData_file_write_prepare(&mesh->vdata, &vlayers, vlayers_buff, ARRAY_SIZE(vlayers_buff));
-				CustomData_file_write_prepare(&mesh->edata, &elayers, elayers_buff, ARRAY_SIZE(elayers_buff));
-				CustomData_file_write_prepare(&mesh->fdata, &flayers, flayers_buff, ARRAY_SIZE(flayers_buff));
+			CustomData_file_write_prepare(&mesh->vdata, &vlayers, vlayers_buff, ARRAY_SIZE(vlayers_buff));
+			CustomData_file_write_prepare(&mesh->edata, &elayers, elayers_buff, ARRAY_SIZE(elayers_buff));
+			CustomData_file_write_prepare(&mesh->fdata, &flayers, flayers_buff, ARRAY_SIZE(flayers_buff));
 #if 0
-				CustomData_file_write_prepare(&mesh->ldata, &llayers, llayers_buff, ARRAY_SIZE(llayers_buff));
-				CustomData_file_write_prepare(&mesh->pdata, &players, players_buff, ARRAY_SIZE(players_buff));
+			CustomData_file_write_prepare(&mesh->ldata, &llayers, llayers_buff, ARRAY_SIZE(llayers_buff));
+			CustomData_file_write_prepare(&mesh->pdata, &players, players_buff, ARRAY_SIZE(players_buff));
 #endif
 
-				writestruct_at_address(wd, ID_ME, Mesh, 1, old_mesh, mesh);
-				write_iddata(wd, &mesh->id);
+			writestruct_at_address(wd, ID_ME, Mesh, 1, old_mesh, mesh);
+			write_iddata(wd, &mesh->id);
 
-				/* direct data */
-				if (mesh->adt) {
-					write_animdata(wd, mesh->adt);
-				}
+			/* direct data */
+			if (mesh->adt) {
+				write_animdata(wd, mesh->adt);
+			}
 
-				writedata(wd, DATA, sizeof(void *) * mesh->totcol, mesh->mat);
-				/* writedata(wd, DATA, sizeof(MSelect) * mesh->totselect, mesh->mselect); */ /* pre-bmesh NULL's */
+			writedata(wd, DATA, sizeof(void *) * mesh->totcol, mesh->mat);
+			/* writedata(wd, DATA, sizeof(MSelect) * mesh->totselect, mesh->mselect); */ /* pre-bmesh NULL's */
 
-				write_customdata(wd, &mesh->id, mesh->totvert, &mesh->vdata, vlayers, -1, 0);
-				write_customdata(wd, &mesh->id, mesh->totedge, &mesh->edata, elayers, -1, 0);
-				write_customdata(wd, &mesh->id, mesh->totface, &mesh->fdata, flayers, -1, 0);
-				/* harmless for older blender versioins but _not_ writing these keeps file size down */
+			write_customdata(wd, &mesh->id, mesh->totvert, &mesh->vdata, vlayers, -1, 0);
+			write_customdata(wd, &mesh->id, mesh->totedge, &mesh->edata, elayers, -1, 0);
+			write_customdata(wd, &mesh->id, mesh->totface, &mesh->fdata, flayers, -1, 0);
+			/* harmless for older blender versioins but _not_ writing these keeps file size down */
 #if 0
-				write_customdata(wd, &mesh->id, mesh->totloop, &mesh->ldata, llayers, -1, 0);
-				write_customdata(wd, &mesh->id, mesh->totpoly, &mesh->pdata, players, -1, 0);
+			write_customdata(wd, &mesh->id, mesh->totloop, &mesh->ldata, llayers, -1, 0);
+			write_customdata(wd, &mesh->id, mesh->totpoly, &mesh->pdata, players, -1, 0);
 #endif
 
-				CustomData_free(&mesh->fdata, mesh->totface);
-				flayers = NULL;
+			CustomData_free(&mesh->fdata, mesh->totface);
+			flayers = NULL;
 
-				/* restore pointer */
-				mesh = old_mesh;
+			/* restore pointer */
+			mesh = old_mesh;
 #endif /* USE_BMESH_SAVE_AS_COMPAT */
-			}
-		}
-
-		if (vlayers && vlayers != vlayers_buff) {
-			MEM_freeN(vlayers);
 		}
-		if (elayers && elayers != elayers_buff) {
-			MEM_freeN(elayers);
-		}
-		if (flayers && flayers != flayers_buff) {
-			MEM_freeN(flayers);
-		}
-		if (llayers && llayers != llayers_buff) {
-			MEM_freeN(llayers);
-		}
-		if (players && players != players_buff) {
-			MEM_freeN(players);
-		}
-
-		mesh = mesh->id.next;
 	}
 
-	mywrite_flush(wd);
+	if (vlayers && vlayers != vlayers_buff) {
+		MEM_freeN(vlayers);
+	}
+	if (elayers && elayers != elayers_buff) {
+		MEM_freeN(elayers);
+	}
+	if (flayers && flayers != flayers_buff) {
+		MEM_freeN(flayers);
+	}
+	if (llayers && llayers != llayers_buff) {
+		MEM_freeN(llayers);
+	}
+	if (players && players != players_buff) {
+		MEM_freeN(players);
+	}
 }
 
-static void write_lattices(WriteData *wd, ListBase *idbase)
+static void write_lattice(WriteData *wd, Lattice *lt)
 {
-	Lattice *lt;
-
-	lt = idbase->first;
-	while (lt) {
-		if (lt->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_LT, Lattice, 1, lt);
-			write_iddata(wd, &lt->id);
-
-			/* write animdata */
-			if (lt->adt) {
-				write_animdata(wd, lt->adt);
-			}
+	if (lt->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_LT, Lattice, 1, lt);
+		write_iddata(wd, &lt->id);
 
-			/* direct data */
-			writestruct(wd, DATA, BPoint, lt->pntsu * lt->pntsv * lt->pntsw, lt->def);
+		/* write animdata */
+		if (lt->adt) {
+			write_animdata(wd, lt->adt);
+		}
 
-			write_dverts(wd, lt->pntsu * lt->pntsv * lt->pntsw, lt->dvert);
+		/* direct data */
+		writestruct(wd, DATA, BPoint, lt->pntsu * lt->pntsv * lt->pntsw, lt->def);
 
-		}
-		lt = lt->id.next;
+		write_dverts(wd, lt->pntsu * lt->pntsv * lt->pntsw, lt->dvert);
 	}
-
-	mywrite_flush(wd);
 }
 
-static void write_images(WriteData *wd, ListBase *idbase)
+static void write_image(WriteData *wd, Image *ima)
 {
-	Image *ima;
-	PackedFile *pf;
-	ImageView *iv;
-	ImagePackedFile *imapf;
-
-	ima = idbase->first;
-	while (ima) {
-		if (ima->id.us > 0 || wd->current) {
-			/* Some trickery to keep forward compatibility of packed images. */
-			BLI_assert(ima->packedfile == NULL);
-			if (ima->packedfiles.first != NULL) {
-				imapf = ima->packedfiles.first;
-				ima->packedfile = imapf->packedfile;
-			}
-
-			/* write LibData */
-			writestruct(wd, ID_IM, Image, 1, ima);
-			write_iddata(wd, &ima->id);
+	if (ima->id.us > 0 || wd->current) {
+		ImagePackedFile *imapf;
 
-			for (imapf = ima->packedfiles.first; imapf; imapf = imapf->next) {
-				writestruct(wd, DATA, ImagePackedFile, 1, imapf);
-				if (imapf->packedfile) {
-					pf = imapf->packedfile;
-					writestruct(wd, DATA, PackedFile, 1, pf);
-					writedata(wd, DATA, pf->size, pf->data);
-				}
-			}
+		/* Some trickery to keep forward compatibility of packed images. */
+		BLI_assert(ima->packedfile == NULL);
+		if (ima->packedfiles.first != NULL) {
+			imapf = ima->packedfiles.first;
+			ima->packedfile = imapf->packedfile;
+		}
 
-			write_previews(wd, ima->preview);
+		/* write LibData */
+		writestruct(wd, ID_IM, Image, 1, ima);
+		write_iddata(wd, &ima->id);
 
-			for (iv = ima->views.first; iv; iv = iv->next) {
-				writestruct(wd, DATA, ImageView, 1, iv);
+		for (imapf = ima->packedfiles.first; imapf; imapf = imapf->next) {
+			writestruct(wd, DATA, ImagePackedFile, 1, imapf);
+			if (imapf->packedfile) {
+				PackedFile *pf = imapf->packedfile;
+				writestruct(wd, DATA, PackedFile, 1, pf);
+				writedata(wd, DATA, pf->size, pf->data);
 			}
-			writestruct(wd, DATA, Stereo3dFormat, 1, ima->stereo3d_format);
+		}
 
-			ima->packedfile = NULL;
+		write_previews(wd, ima->preview);
+
+		for (ImageView *iv = ima->views.first; iv; iv = iv->next) {
+			writestruct(wd, DATA, ImageView, 1, iv);
 		}
-		ima = ima->id.next;
-	}
+		writestruct(wd, DATA, Stereo3dFormat, 1, ima->stereo3d_format);
 
-	mywrite_flush(wd);
+		ima->packedfile = NULL;
+	}
 }
 
-static void write_textures(WriteData *wd, ListBase *idbase)
+static void write_texture(WriteData *wd, Tex *tex)
 {
-	Tex *tex;
-
-	tex = idbase->first;
-	while (tex) {
-		if (tex->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_TE, Tex, 1, tex);
-			write_iddata(wd, &tex->id);
+	if (tex->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_TE, Tex, 1, tex);
+		write_iddata(wd, &tex->id);
 
-			if (tex->adt) {
-				write_animdata(wd, tex->adt);
-			}
+		if (tex->adt) {
+			write_animdata(wd, tex->adt);
+		}
 
-			/* direct data */
-			if (tex->coba) {
-				writestruct(wd, DATA, ColorBand, 1, tex->coba);
-			}
-			if (tex->type == TEX_ENVMAP && tex->env) {
-				writestruct(wd, DATA, EnvMap, 1, tex->env);
-			}
-			if (tex->type == TEX_POINTDENSITY && tex->pd) {
-				writestruct(wd, DATA, PointDensity, 1, tex->pd);
-				if (tex->pd->coba) {
-					writestruct(wd, DATA, ColorBand, 1, tex->pd->coba);
-				}
-				if (tex->pd->falloff_curve) {
-					write_curvemapping(wd, tex->pd->falloff_curve);
-				}
-			}
-			if (tex->type == TEX_VOXELDATA) {
-				writestruct(wd, DATA, VoxelData, 1, tex->vd);
-			}
-			if (tex->type == TEX_OCEAN && tex->ot) {
-				writestruct(wd, DATA, OceanTex, 1, tex->ot);
+		/* direct data */
+		if (tex->coba) {
+			writestruct(wd, DATA, ColorBand, 1, tex->coba);
+		}
+		if (tex->type == TEX_ENVMAP && tex->env) {
+			writestruct(wd, DATA, EnvMap, 1, tex->env);
+		}
+		if (tex->type == TEX_POINTDENSITY && tex->pd) {
+			writestruct(wd, DATA, PointDensity, 1, tex->pd);
+			if (tex->pd->coba) {
+				writestruct(wd, DATA, ColorBand, 1, tex->pd->coba);
 			}
-
-			/* nodetree is integral part of texture, no libdata */
-			if (tex->nodetree) {
-				writestruct(wd, DATA, bNodeTree, 1, tex->nodetree);
-				write_nodetree(wd, tex->nodetree);
+			if (tex->pd->falloff_curve) {
+				write_curvemapping(wd, tex->pd->falloff_curve);
 			}
+		}
+		if (tex->type == TEX_VOXELDATA) {
+			writestruct(wd, DATA, VoxelData, 1, tex->vd);
+		}
+		if (tex->type == TEX_OCEAN && tex->ot) {
+			writestruct(wd, DATA, OceanTex, 1, tex->ot);
+		}
 
-			write_previews(wd, tex->preview);
+		/* nodetree is integral part of texture, no libdata */
+		if (tex->nodetree) {
+			writestruct(wd, DATA, bNodeTree, 1, tex->nodetree);
+			write_nodetree_nolib(wd, tex->nodetree);
 		}
-		tex = tex->id.next;
-	}
 
-	mywrite_flush(wd);
+		write_previews(wd, tex->preview);
+	}
 }
 
-static void write_materials(WriteData *wd, ListBase *idbase)
+static void write_material(WriteData *wd, Material *ma)
 {
-	Material *ma;
-	int a;
-
-	ma = idbase->first;
-	while (ma) {
-		if (ma->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_MA, Material, 1, ma);
-			write_iddata(wd, &ma->id);
-
-			if (ma->adt) {
-				write_animdata(wd, ma->adt);
-			}
+	if (ma->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_MA, Material, 1, ma);
+		write_iddata(wd, &ma->id);
 
-			for (a = 0; a < MAX_MTEX; a++) {
-				if (ma->mtex[a]) {
-					writestruct(wd, DATA, MTex, 1, ma->mtex[a]);
-				}
-			}
+		if (ma->adt) {
+			write_animdata(wd, ma->adt);
+		}
 
-			if (ma->ramp_col) {
-				writestruct(wd, DATA, ColorBand, 1, ma->ramp_col);
-			}
-			if (ma->ramp_spec) {
-				writestruct(wd, DATA, ColorBand, 1, ma->ramp_spec);
+		for (int a = 0; a < MAX_MTEX; a++) {
+			if (ma->mtex[a]) {
+				writestruct(wd, DATA, MTex, 1, ma->mtex[a]);
 			}
+		}
 
-			/* nodetree is integral part of material, no libdata */
-			if (ma->nodetree) {
-				writestruct(wd, DATA, bNodeTree, 1, ma->nodetree);
-				write_nodetree(wd, ma->nodetree);
-			}
+		if (ma->ramp_col) {
+			writestruct(wd, DATA, ColorBand, 1, ma->ramp_col);
+		}
+		if (ma->ramp_spec) {
+			writestruct(wd, DATA, ColorBand, 1, ma->ramp_spec);
+		}
 
-			write_previews(wd, ma->preview);
+		/* nodetree is integral part of material, no libdata */
+		if (ma->nodetree) {
+			writestruct(wd, DATA, bNodeTree, 1, ma->nodetree);
+			write_nodetree_nolib(wd, ma->nodetree);
 		}
-		ma = ma->id.next;
+
+		write_previews(wd, ma->preview);
 	}
 }
 
-static void write_worlds(WriteData *wd, ListBase *idbase)
+static void write_world(WriteData *wd, World *wrld)
 {
-	World *wrld;
-	int a;
-
-	wrld = idbase->first;
-	while (wrld) {
-		if (wrld->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_WO, World, 1, wrld);
-			write_iddata(wd, &wrld->id);
-
-			if (wrld->adt) {
-				write_animdata(wd, wrld->adt);
-			}
+	if (wrld->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_WO, World, 1, wrld);
+		write_iddata(wd, &wrld->id);
 
-			for (a = 0; a < MAX_MTEX; a++) {
-				if (wrld->mtex[a]) {
-					writestruct(wd, DATA, MTex, 1, wrld->mtex[a]);
-				}
-			}
+		if (wrld->adt) {
+			write_animdata(wd, wrld->adt);
+		}
 
-			/* nodetree is integral part of world, no libdata */
-			if (wrld->nodetree) {
-				writestruct(wd, DATA, bNodeTree, 1, wrld->nodetree);
-				write_nodetree(wd, wrld->nodetree);
+		for (int a = 0; a < MAX_MTEX; a++) {
+			if (wrld->mtex[a]) {
+				writestruct(wd, DATA, MTex, 1, wrld->mtex[a]);
 			}
+		}
 
-			write_previews(wd, wrld->preview);
+		/* nodetree is integral part of world, no libdata */
+		if (wrld->nodetree) {
+			writestruct(wd, DATA, bNodeTree, 1, wrld->nodetree);
+			write_nodetree_nolib(wd, wrld->nodetree);
 		}
-		wrld = wrld->id.next;
+
+		write_previews(wd, wrld->preview);
 	}
 }
 
-static void write_lamps(WriteData *wd, ListBase *idbase)
+static void write_lamp(WriteData *wd, Lamp *la)
 {
-	Lamp *la;
-	int a;
-
-	la = idbase->first;
-	while (la) {
-		if (la->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_LA, Lamp, 1, la);
-			write_iddata(wd, &la->id);
-
-			if (la->adt) {
-				write_animdata(wd, la->adt);
-			}
-
-			/* direct data */
-			for (a = 0; a < MAX_MTEX; a++) {
-				if (la->mtex[a]) {
-					writestruct(wd, DATA, MTex, 1, la->mtex[a]);
-				}
-			}
+	if (la->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_LA, Lamp, 1, la);
+		write_iddata(wd, &la->id);
 
-			if (la->curfalloff) {
-				write_curvemapping(wd, la->curfalloff);
-			}
+		if (la->adt) {
+			write_animdata(wd, la->adt);
+		}
 
-			/* nodetree is integral part of lamps, no libdata */
-			if (la->nodetree) {
-				writestruct(wd, DATA, bNodeTree, 1, la->nodetree);
-				write_nodetree(wd, la->nodetree);
+		/* direct data */
+		for (int a = 0; a < MAX_MTEX; a++) {
+			if (la->mtex[a]) {
+				writestruct(wd, DATA, MTex, 1, la->mtex[a]);
 			}
+		}
 
-			write_previews(wd, la->preview);
+		if (la->curfalloff) {
+			write_curvemapping(wd, la->curfalloff);
+		}
 
+		/* nodetree is integral part of lamps, no libdata */
+		if (la->nodetree) {
+			writestruct(wd, DATA, bNodeTree, 1, la->nodetree);
+			write_nodetree_nolib(wd, la->nodetree);
 		}
-		la = la->id.next;
-	}
 
-	mywrite_flush(wd);
+		write_previews(wd, la->preview);
+	}
 }
 
 static void write_sequence_modifiers(WriteData *wd, ListBase *modbase)
@@ -2626,294 +2536,253 @@ static void write_paint(WriteData *wd, Paint *p)
 	}
 }
 
-static void write_scenes(WriteData *wd, ListBase *scebase)
+static void write_scene(WriteData *wd, Scene *sce)
 {
-	Scene *sce;
-	Base *base;
-	Editing *ed;
-	Sequence *seq;
-	MetaStack *ms;
-	Strip *strip;
-	TimeMarker *marker;
-	TransformOrientation *ts;
-	SceneRenderLayer *srl;
-	SceneRenderView *srv;
-	ToolSettings *tos;
-	FreestyleModuleConfig *fmc;
-	FreestyleLineSet *fls;
-
-	sce = scebase->first;
-	while (sce) {
-		/* write LibData */
-		writestruct(wd, ID_SCE, Scene, 1, sce);
-		write_iddata(wd, &sce->id);
+	/* write LibData */
+	writestruct(wd, ID_SCE, Scene, 1, sce);
+	write_iddata(wd, &sce->id);
 
-		if (sce->adt) {
-			write_animdata(wd, sce->adt);
-		}
-		write_keyingsets(wd, &sce->keyingsets);
+	if (sce->adt) {
+		write_animdata(wd, sce->adt);
+	}
+	write_keyingsets(wd, &sce->keyingsets);
 
-		/* direct data */
-		base = sce->base.first;
-		while (base) {
-			writestruct(wd, DATA, Base, 1, base);
-			base = base->next;
-		}
-
-		tos = sce->toolsettings;
-		writestruct(wd, DATA, ToolSettings, 1, tos);
-		if (tos->vpaint) {
-			writestruct(wd, DATA, VPaint, 1, tos->vpaint);
-			write_paint(wd, &tos->vpaint->paint);
-		}
-		if (tos->wpaint) {
-			writestruct(wd, DATA, VPaint, 1, tos->wpaint);
-			write_paint(wd, &tos->wpaint->paint);
-		}
-		if (tos->sculpt) {
-			writestruct(wd, DATA, Sculpt, 1, tos->sculpt);
-			write_paint(wd, &tos->sculpt->paint);
-		}
-		if (tos->uvsculpt) {
-			writestruct(wd, DATA, UvSculpt, 1, tos->uvsculpt);
-			write_paint(wd, &tos->uvsculpt->paint);
-		}
-		/* write grease-pencil drawing brushes to file */
-		writelist(wd, DATA, bGPDbrush, &tos->gp_brushes);
-		for (bGPDbrush *brush = tos->gp_brushes.first; brush; brush = brush->next) {
-			if (brush->cur_sensitivity) {
-				write_curvemapping(wd, brush->cur_sensitivity);
-			}
-			if (brush->cur_strength) {
-				write_curvemapping(wd, brush->cur_strength);
-			}
-			if (brush->cur_jitter) {
-				write_curvemapping(wd, brush->cur_jitter);
-			}
+	/* direct data */
+	for (Base *base = sce->base.first; base; base = base->next) {
+		writestruct(wd, DATA, Base, 1, base);
+	}
+
+	ToolSettings *tos = sce->toolsettings;
+	writestruct(wd, DATA, ToolSettings, 1, tos);
+	if (tos->vpaint) {
+		writestruct(wd, DATA, VPaint, 1, tos->vpaint);
+		write_paint(wd, &tos->vpaint->paint);
+	}
+	if (tos->wpaint) {
+		writestruct(wd, DATA, VPaint, 1, tos->wpaint);
+		write_paint(wd, &tos->wpaint->paint);
+	}
+	if (tos->sculpt) {
+		writestruct(wd, DATA, Sculpt, 1, tos->sculpt);
+		write_paint(wd, &tos->sculpt->paint);
+	}
+	if (tos->uvsculpt) {
+		writestruct(wd, DATA, UvSculpt, 1, tos->uvsculpt);
+		write_paint(wd, &tos->uvsculpt->paint);
+	}
+	/* write grease-pencil drawing brushes to file */
+	writelist(wd, DATA, bGPDbrush, &tos->gp_brushes);
+	for (bGPDbrush *brush = tos->gp_brushes.first; brush; brush = brush->next) {
+		if (brush->cur_sensitivity) {
+			write_curvemapping(wd, brush->cur_sensitivity);
 		}
-		/* write grease-pencil custom ipo curve to file */
-		if (tos->gp_interpolate.custom_ipo) {
-			write_curvemapping(wd, tos->gp_interpolate.custom_ipo);
+		if (brush->cur_strength) {
+			write_curvemapping(wd, brush->cur_strength);
 		}
-		
+		if (brush->cur_jitter) {
+			write_curvemapping(wd, brush->cur_jitter);
+		}
+	}
+	/* write grease-pencil custom ipo curve to file */
+	if (tos->gp_interpolate.custom_ipo) {
+		write_curvemapping(wd, tos->gp_interpolate.custom_ipo);
+	}
 
-		write_paint(wd, &tos->imapaint.paint);
 
-		ed = sce->ed;
-		if (ed) {
-			writestruct(wd, DATA, Editing, 1, ed);
+	write_paint(wd, &tos->imapaint.paint);
 
-			/* reset write flags too */
+	Editing *ed = sce->ed;
+	if (ed) {
+		Sequence *seq;
 
-			SEQ_BEGIN(ed, seq)
-			{
-				if (seq->strip) {
-					seq->strip->done = false;
-				}
-				writestruct(wd, DATA, Sequence, 1, seq);
-			}
-			SEQ_END
+		writestruct(wd, DATA, Editing, 1, ed);
 
-			SEQ_BEGIN(ed, seq)
-			{
-				if (seq->strip && seq->strip->done == 0) {
-					/* write strip with 'done' at 0 because readfile */
-
-					if (seq->effectdata) {
-						switch (seq->type) {
-							case SEQ_TYPE_COLOR:
-								writestruct(wd, DATA, SolidColorVars, 1, seq->effectdata);
-								break;
-							case SEQ_TYPE_SPEED:
-								writestruct(wd, DATA, SpeedControlVars, 1, seq->effectdata);
-								break;
-							case SEQ_TYPE_WIPE:
-								writestruct(wd, DATA, WipeVars, 1, seq->effectdata);
-								break;
-							case SEQ_TYPE_GLOW:
-								writestruct(wd, DATA, GlowVars, 1, seq->effectdata);
-								break;
-							case SEQ_TYPE_TRANSFORM:
-								writestruct(wd, DATA, TransformVars, 1, seq->effectdata);
-								break;
-							case SEQ_TYPE_GAUSSIAN_BLUR:
-								writestruct(wd, DATA, GaussianBlurVars, 1, seq->effectdata);
-								break;
-							case SEQ_TYPE_TEXT:
-								writestruct(wd, DATA, TextVars, 1, seq->effectdata);
-								break;
-						}
-					}
+		/* reset write flags too */
 
-					writestruct(wd, DATA, Stereo3dFormat, 1, seq->stereo3d_format);
+		SEQ_BEGIN(ed, seq)
+		{
+			if (seq->strip) {
+				seq->strip->done = false;
+			}
+			writestruct(wd, DATA, Sequence, 1, seq);
+		}
+		SEQ_END
 
-					strip = seq->strip;
-					writestruct(wd, DATA, Strip, 1, strip);
-					if (seq->flag & SEQ_USE_CROP && strip->crop) {
-						writestruct(wd, DATA, StripCrop, 1, strip->crop);
-					}
-					if (seq->flag & SEQ_USE_TRANSFORM && strip->transform) {
-						writestruct(wd, DATA, StripTransform, 1, strip->transform);
-					}
-					if (seq->flag & SEQ_USE_PROXY && strip->proxy) {
-						writestruct(wd, DATA, StripProxy, 1, strip->proxy);
-					}
-					if (seq->type == SEQ_TYPE_IMAGE) {
-						writestruct(wd, DATA, StripElem,
-						            MEM_allocN_len(strip->stripdata) / sizeof(struct StripElem),
-						            strip->stripdata);
-					}
-					else if (ELEM(seq->type, SEQ_TYPE_MOVIE, SEQ_TYPE_SOUND_RAM, SEQ_TYPE_SOUND_HD)) {
-						writestruct(wd, DATA, StripElem, 1, strip->stripdata);
+		SEQ_BEGIN(ed, seq)
+		{
+			if (seq->strip && seq->strip->done == 0) {
+				/* write strip with 'done' at 0 because readfile */
+
+				if (seq->effectdata) {
+					switch (seq->type) {
+						case SEQ_TYPE_COLOR:
+							writestruct(wd, DATA, SolidColorVars, 1, seq->effectdata);
+							break;
+						case SEQ_TYPE_SPEED:
+							writestruct(wd, DATA, SpeedControlVars, 1, seq->effectdata);
+							break;
+						case SEQ_TYPE_WIPE:
+							writestruct(wd, DATA, WipeVars, 1, seq->effectdata);
+							break;
+						case SEQ_TYPE_GLOW:
+							writestruct(wd, DATA, GlowVars, 1, seq->effectdata);
+							break;
+						case SEQ_TYPE_TRANSFORM:
+							writestruct(wd, DATA, TransformVars, 1, seq->effectdata);
+							break;
+						case SEQ_TYPE_GAUSSIAN_BLUR:
+							writestruct(wd, DATA, GaussianBlurVars, 1, seq->effectdata);
+							break;
+						case SEQ_TYPE_TEXT:
+							writestruct(wd, DATA, TextVars, 1, seq->effectdata);
+							break;
 					}
-
-					strip->done = true;
 				}
 
-				if (seq->prop) {
-					IDP_WriteProperty(seq->prop, wd);
+				writestruct(wd, DATA, Stereo3dFormat, 1, seq->stereo3d_format);
+
+				Strip *strip = seq->strip;
+				writestruct(wd, DATA, Strip, 1, strip);
+				if (seq->flag & SEQ_USE_CROP && strip->crop) {
+					writestruct(wd, DATA, StripCrop, 1, strip->crop);
+				}
+				if (seq->flag & SEQ_USE_TRANSFORM && strip->transform) {
+					writestruct(wd, DATA, StripTransform, 1, strip->transform);
+				}
+				if (seq->flag & SEQ_USE_PROXY && strip->proxy) {
+					writestruct(wd, DATA, StripProxy, 1, strip->proxy);
+				}
+				if (seq->type == SEQ_TYPE_IMAGE) {
+					writestruct(wd, DATA, StripElem,
+								MEM_allocN_len(strip->stripdata) / sizeof(struct StripElem),
+								strip->stripdata);
+				}
+				else if (ELEM(seq->type, SEQ_TYPE_MOVIE, SEQ_TYPE_SOUND_RAM, SEQ_TYPE_SOUND_HD)) {
+					writestruct(wd, DATA, StripElem, 1, strip->stripdata);
 				}
 
-				write_sequence_modifiers(wd, &seq->modifiers);
+				strip->done = true;
 			}
-			SEQ_END
 
-			/* new; meta stack too, even when its nasty restore code */
-			for (ms = ed->metastack.first; ms; ms = ms->next) {
-				writestruct(wd, DATA, MetaStack, 1, ms);
+			if (seq->prop) {
+				IDP_WriteProperty(seq->prop, wd);
 			}
+
+			write_sequence_modifiers(wd, &seq->modifiers);
 		}
+		SEQ_END
 
-		if (sce->r.avicodecdata) {
-			writestruct(wd, DATA, AviCodecData, 1, sce->r.avicodecdata);
-			if (sce->r.avicodecdata->lpFormat) {
-				writedata(wd, DATA, sce->r.avicodecdata->cbFormat, sce->r.avicodecdata->lpFormat);
-			}
-			if (sce->r.avicodecdata->lpParms) {
-				writedata(wd, DATA, sce->r.avicodecdata->cbParms, sce->r.avicodecdata->lpParms);
-			}
+		/* new; meta stack too, even when its nasty restore code */
+		for (MetaStack *ms = ed->metastack.first; ms; ms = ms->next) {
+			writestruct(wd, DATA, MetaStack, 1, ms);
 		}
+	}
 
-		if (sce->r.qtcodecdata) {
-			writestruct(wd, DATA, QuicktimeCodecData, 1, sce->r.qtcodecdata);
-			if (sce->r.qtcodecdata->cdParms) {
-				writedata(wd, DATA, sce->r.qtcodecdata->cdSize, sce->r.qtcodecdata->cdParms);
-			}
+	if (sce->r.avicodecdata) {
+		writestruct(wd, DATA, AviCodecData, 1, sce->r.avicodecdata);
+		if (sce->r.avicodecdata->lpFormat) {
+			writedata(wd, DATA, sce->r.avicodecdata->cbFormat, sce->r.avicodecdata->lpFormat);
 		}
-		if (sce->r.ffcodecdata.properties) {
-			IDP_WriteProperty(sce->r.ffcodecdata.properties, wd);
+		if (sce->r.avicodecdata->lpParms) {
+			writedata(wd, DATA, sce->r.avicodecdata->cbParms, sce->r.avicodecdata->lpParms);
 		}
+	}
 
-		/* writing dynamic list of TimeMarkers to the blend file */
-		for (marker = sce->markers.first; marker; marker = marker->next) {
-			writestruct(wd, DATA, TimeMarker, 1, marker);
+	if (sce->r.qtcodecdata) {
+		writestruct(wd, DATA, QuicktimeCodecData, 1, sce->r.qtcodecdata);
+		if (sce->r.qtcodecdata->cdParms) {
+			writedata(wd, DATA, sce->r.qtcodecdata->cdSize, sce->r.qtcodecdata->cdParms);
 		}
+	}
+	if (sce->r.ffcodecdata.properties) {
+		IDP_WriteProperty(sce->r.ffcodecdata.properties, wd);
+	}
 
-		/* writing dynamic list of TransformOrientations to the blend file */
-		for (ts = sce->transform_spaces.first; ts; ts = ts->next) {
-			writestruct(wd, DATA, TransformOrientation, 1, ts);
-		}
+	/* writing dynamic list of TimeMarkers to the blend file */
+	for (TimeMarker *marker = sce->markers.first; marker; marker = marker->next) {
+		writestruct(wd, DATA, TimeMarker, 1, marker);
+	}
 
-		for (srl = sce->r.layers.first; srl; srl = srl->next) {
-			writestruct(wd, DATA, SceneRenderLayer, 1, srl);
-			for (fmc = srl->freestyleConfig.modules.first; fmc; fmc = fmc->next) {
-				writestruct(wd, DATA, FreestyleModuleConfig, 1, fmc);
-			}
-			for (fls = srl->freestyleConfig.linesets.first; fls; fls = fls->next) {
-				writestruct(wd, DATA, FreestyleLineSet, 1, fls);
-			}
-		}
+	/* writing dynamic list of TransformOrientations to the blend file */
+	for (TransformOrientation *ts = sce->transform_spaces.first; ts; ts = ts->next) {
+		writestruct(wd, DATA, TransformOrientation, 1, ts);
+	}
 
-		/* writing MultiView to the blend file */
-		for (srv = sce->r.views.first; srv; srv = srv->next) {
-			writestruct(wd, DATA, SceneRenderView, 1, srv);
+	for (SceneRenderLayer *srl = sce->r.layers.first; srl; srl = srl->next) {
+		writestruct(wd, DATA, SceneRenderLayer, 1, srl);
+		for (FreestyleModuleConfig *fmc = srl->freestyleConfig.modules.first; fmc; fmc = fmc->next) {
+			writestruct(wd, DATA, FreestyleModuleConfig, 1, fmc);
 		}
-
-		if (sce->nodetree) {
-			writestruct(wd, DATA, bNodeTree, 1, sce->nodetree);
-			write_nodetree(wd, sce->nodetree);
+		for (FreestyleLineSet *fls = srl->freestyleConfig.linesets.first; fls; fls = fls->next) {
+			writestruct(wd, DATA, FreestyleLineSet, 1, fls);
 		}
+	}
 
-		write_view_settings(wd, &sce->view_settings);
+	/* writing MultiView to the blend file */
+	for (SceneRenderView *srv = sce->r.views.first; srv; srv = srv->next) {
+		writestruct(wd, DATA, SceneRenderView, 1, srv);
+	}
 
-		/* writing RigidBodyWorld data to the blend file */
-		if (sce->rigidbody_world) {
-			writestruct(wd, DATA, RigidBodyWorld, 1, sce->rigidbody_world);
-			writestruct(wd, DATA, EffectorWeights, 1, sce->rigidbody_world->effector_weights);
-			write_pointcaches(wd, &(sce->rigidbody_world->ptcaches));
-		}
+	if (sce->nodetree) {
+		writestruct(wd, DATA, bNodeTree, 1, sce->nodetree);
+		write_nodetree_nolib(wd, sce->nodetree);
+	}
 
-		write_previews(wd, sce->preview);
-		write_curvemapping_curves(wd, &sce->r.mblur_shutter_curve);
+	write_view_settings(wd, &sce->view_settings);
 
-		sce = sce->id.next;
+	/* writing RigidBodyWorld data to the blend file */
+	if (sce->rigidbody_world) {
+		writestruct(wd, DATA, RigidBodyWorld, 1, sce->rigidbody_world);
+		writestruct(wd, DATA, EffectorWeights, 1, sce->rigidbody_world->effector_weights);
+		write_pointcaches(wd, &(sce->rigidbody_world->ptcaches));
 	}
 
-	mywrite_flush(wd);
+	write_previews(wd, sce->preview);
+	write_curvemapping_curves(wd, &sce->r.mblur_shutter_curve);
 }
 
-static void write_gpencils(WriteData *wd, ListBase *lb)
+static void write_gpencil(WriteData *wd, bGPdata *gpd)
 {
-	bGPdata *gpd;
-	bGPDlayer *gpl;
-	bGPDframe *gpf;
-	bGPDstroke *gps;
-	bGPDpalette *palette;
-
-	for (gpd = lb->first; gpd; gpd = gpd->id.next) {
-		if (gpd->id.us > 0 || wd->current) {
-			/* write gpd data block to file */
-			writestruct(wd, ID_GD, bGPdata, 1, gpd);
-			write_iddata(wd, &gpd->id);
-
-			if (gpd->adt) {
-				write_animdata(wd, gpd->adt);
-			}
-
-			/* write grease-pencil layers to file */
-			writelist(wd, DATA, bGPDlayer, &gpd->layers);
-			for (gpl = gpd->layers.first; gpl; gpl = gpl->next) {
-
-				/* write this layer's frames to file */
-				writelist(wd, DATA, bGPDframe, &gpl->frames);
-				for (gpf = gpl->frames.first; gpf; gpf = gpf->next) {
-
-					/* write strokes */
-					writelist(wd, DATA, bGPDstroke, &gpf->strokes);
-					for (gps = gpf->strokes.first; gps; gps = gps->next) {
-						writestruct(wd, DATA, bGPDspoint, gps->totpoints, gps->points);
-					}
+	if (gpd->id.us > 0 || wd->current) {
+		/* write gpd data block to file */
+		writestruct(wd, ID_GD, bGPdata, 1, gpd);
+		write_iddata(wd, &gpd->id);
+
+		if (gpd->adt) {
+			write_animdata(wd, gpd->adt);
+		}
+
+		/* write grease-pencil layers to file */
+		writelist(wd, DATA, bGPDlayer, &gpd->layers);
+		for (bGPDlayer *gpl = gpd->layers.first; gpl; gpl = gpl->next) {
+			/* write this layer's frames to file */
+			writelist(wd, DATA, bGPDframe, &gpl->frames);
+			for (bGPDframe *gpf = gpl->frames.first; gpf; gpf = gpf->next) {
+				/* write strokes */
+				writelist(wd, DATA, bGPDstroke, &gpf->strokes);
+				for (bGPDstroke *gps = gpf->strokes.first; gps; gps = gps->next) {
+					writestruct(wd, DATA, bGPDspoint, gps->totpoints, gps->points);
 				}
 			}
-			/* write grease-pencil palettes */
-			writelist(wd, DATA, bGPDpalette, &gpd->palettes);
-			for (palette = gpd->palettes.first; palette; palette = palette->next) {
-				writelist(wd, DATA, bGPDpalettecolor, &palette->colors);
-			}
 		}
-	}
 
-	mywrite_flush(wd);
+		/* write grease-pencil palettes */
+		writelist(wd, DATA, bGPDpalette, &gpd->palettes);
+		for (bGPDpalette *palette = gpd->palettes.first; palette; palette = palette->next) {
+			writelist(wd, DATA, bGPDpalettecolor, &palette->colors);
+		}
+	}
 }
 
-static void write_windowmanagers(WriteData *wd, ListBase *lb)
+static void write_windowmanager(WriteData *wd, wmWindowManager *wm)
 {
-	wmWindowManager *wm;
-	wmWindow *win;
-
-	for (wm = lb->first; wm; wm = wm->id.next) {
-		writestruct(wd, ID_WM, wmWindowManager, 1, wm);
-		write_iddata(wd, &wm->id);
+	writestruct(wd, ID_WM, wmWindowManager, 1, wm);
+	write_iddata(wd, &wm->id);
 
-		for (win = wm->windows.first; win; win = win->next) {
-			writestruct(wd, DATA, wmWindow, 1, win);
-			writestruct(wd, DATA, Stereo3dFormat, 1, win->stereo3d_format);
-		}
+	for (wmWindow *win = wm->windows.first; win; win = win->next) {
+		writestruct(wd, DATA, wmWindow, 1, win);
+		writestruct(wd, DATA, Stereo3dFormat, 1, win->stereo3d_format);
 	}
-
-	/* typically flushing wouldn't be needed however this data _always_ changes,
-	 * so flush here for more efficient undo. */
-	mywrite_flush(wd);
 }
 
 static void write_region(WriteData *wd, ARegion *ar, int spacetype)
@@ -2996,184 +2865,168 @@ static void write_soops(WriteData *wd, SpaceOops *so)
 	}
 }
 
-static void write_screens(WriteData *wd, ListBase *scrbase)
+static void write_screen(WriteData *wd, bScreen *sc)
 {
-	bScreen *sc;
-	ScrArea *sa;
-	ScrVert *sv;
-	ScrEdge *se;
-
-	sc = scrbase->first;
-	while (sc) {
+	/* write LibData */
+	/* in 2.50+ files, the file identifier for screens is patched, forward compatibility */
+	writestruct(wd, ID_SCRN, bScreen, 1, sc);
+	write_iddata(wd, &sc->id);
 
-		/* write LibData */
-		/* in 2.50+ files, the file identifier for screens is patched, forward compatibility */
-		writestruct(wd, ID_SCRN, bScreen, 1, sc);
-		write_iddata(wd, &sc->id);
+	/* direct data */
+	for (ScrVert *sv = sc->vertbase.first; sv; sv = sv->next) {
+		writestruct(wd, DATA, ScrVert, 1, sv);
+	}
 
-		/* direct data */
-		for (sv = sc->vertbase.first; sv; sv = sv->next) {
-			writestruct(wd, DATA, ScrVert, 1, sv);
-		}
+	for (ScrEdge *se = sc->edgebase.first; se; se = se->next) {
+		writestruct(wd, DATA, ScrEdge, 1, se);
+	}
 
-		for (se = sc->edgebase.first; se; se = se->next) {
-			writestruct(wd, DATA, ScrEdge, 1, se);
-		}
+	for (ScrArea *sa = sc->areabase.first; sa; sa = sa->next) {
+		SpaceLink *sl;
+		Panel *pa;
+		uiList *ui_list;
+		uiPreview *ui_preview;
+		PanelCategoryStack *pc_act;
+		ARegion *ar;
 
-		for (sa = sc->areabase.first; sa; sa = sa->next) {
-			SpaceLink *sl;
-			Panel *pa;
-			uiList *ui_list;
-			uiPreview *ui_preview;
-			PanelCategoryStack *pc_act;
-			ARegion *ar;
+		writestruct(wd, DATA, ScrArea, 1, sa);
 
-			writestruct(wd, DATA, ScrArea, 1, sa);
+		for (ar = sa->regionbase.first; ar; ar = ar->next) {
+			write_region(wd, ar, sa->spacetype);
 
-			for (ar = sa->regionbase.first; ar; ar = ar->next) {
-				write_region(wd, ar, sa->spacetype);
+			for (pa = ar->panels.first; pa; pa = pa->next) {
+				writestruct(wd, DATA, Panel, 1, pa);
+			}
 
-				for (pa = ar->panels.first; pa; pa = pa->next) {
-					writestruct(wd, DATA, Panel, 1, pa);
-				}
+			for (pc_act = ar->panels_category_active.first; pc_act; pc_act = pc_act->next) {
+				writestruct(wd, DATA, PanelCategoryStack, 1, pc_act);
+			}
 
-				for (pc_act = ar->panels_category_active.first; pc_act; pc_act = pc_act->next) {
-					writestruct(wd, DATA, PanelCategoryStack, 1, pc_act);
-				}
+			for (ui_list = ar->ui_lists.first; ui_list; ui_list = ui_list->next) {
+				write_uilist(wd, ui_list);
+			}
 
-				for (ui_list = ar->ui_lists.first; ui_list; ui_list = ui_list->next) {
-					write_uilist(wd, ui_list);
-				}
+			for (ui_preview = ar->ui_previews.first; ui_preview; ui_preview = ui_preview->next) {
+				writestruct(wd, DATA, uiPreview, 1, ui_preview);
+			}
+		}
 
-				for (ui_preview = ar->ui_previews.first; ui_preview; ui_preview = ui_preview->next) {
-					writestruct(wd, DATA, uiPreview, 1, ui_preview);
-				}
+		for (sl = sa->spacedata.first; sl; sl = sl->next) {
+			for (ar = sl->regionbase.first; ar; ar = ar->next) {
+				write_region(wd, ar, sl->spacetype);
 			}
 
-			sl = sa->spacedata.first;
-			while (sl) {
-				for (ar = sl->regionbase.first; ar; ar = ar->next) {
-					write_region(wd, ar, sl->spacetype);
+			if (sl->spacetype == SPACE_VIEW3D) {
+				View3D *v3d = (View3D *)sl;
+				BGpic *bgpic;
+				writestruct(wd, DATA, View3D, 1, v3d);
+				for (bgpic = v3d->bgpicbase.first; bgpic; bgpic = bgpic->next) {
+					writestruct(wd, DATA, BGpic, 1, bgpic);
 				}
-
-				if (sl->spacetype == SPACE_VIEW3D) {
-					View3D *v3d = (View3D *)sl;
-					BGpic *bgpic;
-					writestruct(wd, DATA, View3D, 1, v3d);
-					for (bgpic = v3d->bgpicbase.first; bgpic; bgpic = bgpic->next) {
-						writestruct(wd, DATA, BGpic, 1, bgpic);
-					}
-					if (v3d->localvd) {
-						writestruct(wd, DATA, View3D, 1, v3d->localvd);
-					}
-
-					if (v3d->fx_settings.ssao) {
-						writestruct(wd, DATA, GPUSSAOSettings, 1, v3d->fx_settings.ssao);
-					}
-					if (v3d->fx_settings.dof) {
-						writestruct(wd, DATA, GPUDOFSettings, 1, v3d->fx_settings.dof);
-					}
+				if (v3d->localvd) {
+					writestruct(wd, DATA, View3D, 1, v3d->localvd);
 				}
-				else if (sl->spacetype == SPACE_IPO) {
-					SpaceIpo *sipo = (SpaceIpo *)sl;
-					ListBase tmpGhosts = sipo->ghostCurves;
-
-					/* temporarily disable ghost curves when saving */
-					sipo->ghostCurves.first = sipo->ghostCurves.last = NULL;
 
-					writestruct(wd, DATA, SpaceIpo, 1, sl);
-					if (sipo->ads) {
-						writestruct(wd, DATA, bDopeSheet, 1, sipo->ads);
-					}
-
-					/* reenable ghost curves */
-					sipo->ghostCurves = tmpGhosts;
+				if (v3d->fx_settings.ssao) {
+					writestruct(wd, DATA, GPUSSAOSettings, 1, v3d->fx_settings.ssao);
 				}
-				else if (sl->spacetype == SPACE_BUTS) {
-					writestruct(wd, DATA, SpaceButs, 1, sl);
+				if (v3d->fx_settings.dof) {
+					writestruct(wd, DATA, GPUDOFSettings, 1, v3d->fx_settings.dof);
 				}
-				else if (sl->spacetype == SPACE_FILE) {
-					SpaceFile *sfile = (SpaceFile *)sl;
+			}
+			else if (sl->spacetype == SPACE_IPO) {
+				SpaceIpo *sipo = (SpaceIpo *)sl;
+				ListBase tmpGhosts = sipo->ghostCurves;
 
-					writestruct(wd, DATA, SpaceFile, 1, sl);
-					if (sfile->params) {
-						writestruct(wd, DATA, FileSelectParams, 1, sfile->params);
-					}
-				}
-				else if (sl->spacetype == SPACE_SEQ) {
-					writestruct(wd, DATA, SpaceSeq, 1, sl);
-				}
-				else if (sl->spacetype == SPACE_OUTLINER) {
-					SpaceOops *so = (SpaceOops *)sl;
-					write_soops(wd, so);
-				}
-				else if (sl->spacetype == SPACE_IMAGE) {
-					writestruct(wd, DATA, SpaceImage, 1, sl);
-				}
-				else if (sl->spacetype == SPACE_TEXT) {
-					writestruct(wd, DATA, SpaceText, 1, sl);
-				}
-				else if (sl->spacetype == SPACE_SCRIPT) {
-					SpaceScript *scr = (SpaceScript *)sl;
-					scr->but_refs = NULL;
-					writestruct(wd, DATA, SpaceScript, 1, sl);
-				}
-				else if (sl->spacetype == SPACE_ACTION) {
-					writestruct(wd, DATA, SpaceAction, 1, sl);
-				}
-				else if (sl->spacetype == SPACE_NLA) {
-					SpaceNla *snla = (SpaceNla *)sl;
+				/* temporarily disable ghost curves when saving */
+				sipo->ghostCurves.first = sipo->ghostCurves.last = NULL;
 
-					writestruct(wd, DATA, SpaceNla, 1, snla);
-					if (snla->ads) {
-						writestruct(wd, DATA, bDopeSheet, 1, snla->ads);
-					}
-				}
-				else if (sl->spacetype == SPACE_TIME) {
-					writestruct(wd, DATA, SpaceTime, 1, sl);
+				writestruct(wd, DATA, SpaceIpo, 1, sl);
+				if (sipo->ads) {
+					writestruct(wd, DATA, bDopeSheet, 1, sipo->ads);
 				}
-				else if (sl->spacetype == SPACE_NODE) {
-					SpaceNode *snode = (SpaceNode *)sl;
-					bNodeTreePath *path;
-					writestruct(wd, DATA, SpaceNode, 1, snode);
 
-					for (path = snode->treepath.first; path; path = path->next) {
-						writestruct(wd, DATA, bNodeTreePath, 1, path);
-					}
-				}
-				else if (sl->spacetype == SPACE_LOGIC) {
-					writestruct(wd, DATA, SpaceLogic, 1, sl);
-				}
-				else if (sl->spacetype == SPACE_CONSOLE) {
-					SpaceConsole *con = (SpaceConsole *)sl;
-					ConsoleLine *cl;
-
-					for (cl = con->history.first; cl; cl = cl->next) {
-						/* 'len_alloc' is invalid on write, set from 'len' on read */
-						writestruct(wd, DATA, ConsoleLine, 1, cl);
-						writedata(wd, DATA, cl->len + 1, cl->line);
-					}
-					writestruct(wd, DATA, SpaceConsole, 1, sl);
+				/* reenable ghost curves */
+				sipo->ghostCurves = tmpGhosts;
+			}
+			else if (sl->spacetype == SPACE_BUTS) {
+				writestruct(wd, DATA, SpaceButs, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_FILE) {
+				SpaceFile *sfile = (SpaceFile *)sl;
 
+				writestruct(wd, DATA, SpaceFile, 1, sl);
+				if (sfile->params) {
+					writestruct(wd, DATA, FileSelectParams, 1, sfile->params);
 				}
-				else if (sl->spacetype == SPACE_USERPREF) {
-					writestruct(wd, DATA, SpaceUserPref, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_SEQ) {
+				writestruct(wd, DATA, SpaceSeq, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_OUTLINER) {
+				SpaceOops *so = (SpaceOops *)sl;
+				write_soops(wd, so);
+			}
+			else if (sl->spacetype == SPACE_IMAGE) {
+				writestruct(wd, DATA, SpaceImage, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_TEXT) {
+				writestruct(wd, DATA, SpaceText, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_SCRIPT) {
+				SpaceScript *scr = (SpaceScript *)sl;
+				scr->but_refs = NULL;
+				writestruct(wd, DATA, SpaceScript, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_ACTION) {
+				writestruct(wd, DATA, SpaceAction, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_NLA) {
+				SpaceNla *snla = (SpaceNla *)sl;
+
+				writestruct(wd, DATA, SpaceNla, 1, snla);
+				if (snla->ads) {
+					writestruct(wd, DATA, bDopeSheet, 1, snla->ads);
 				}
-				else if (sl->spacetype == SPACE_CLIP) {
-					writestruct(wd, DATA, SpaceClip, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_TIME) {
+				writestruct(wd, DATA, SpaceTime, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_NODE) {
+				SpaceNode *snode = (SpaceNode *)sl;
+				bNodeTreePath *path;
+				writestruct(wd, DATA, SpaceNode, 1, snode);
+
+				for (path = snode->treepath.first; path; path = path->next) {
+					writestruct(wd, DATA, bNodeTreePath, 1, path);
 				}
-				else if (sl->spacetype == SPACE_INFO) {
-					writestruct(wd, DATA, SpaceInfo, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_LOGIC) {
+				writestruct(wd, DATA, SpaceLogic, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_CONSOLE) {
+				SpaceConsole *con = (SpaceConsole *)sl;
+				ConsoleLine *cl;
+
+				for (cl = con->history.first; cl; cl = cl->next) {
+					/* 'len_alloc' is invalid on write, set from 'len' on read */
+					writestruct(wd, DATA, ConsoleLine, 1, cl);
+					writedata(wd, DATA, cl->len + 1, cl->line);
 				}
+				writestruct(wd, DATA, SpaceConsole, 1, sl);
 
-				sl = sl->next;
+			}
+			else if (sl->spacetype == SPACE_USERPREF) {
+				writestruct(wd, DATA, SpaceUserPref, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_CLIP) {
+				writestruct(wd, DATA, SpaceClip, 1, sl);
+			}
+			else if (sl->spacetype == SPACE_INFO) {
+				writestruct(wd, DATA, SpaceInfo, 1, sl);
 			}
 		}
-
-		sc = sc->id.next;
 	}
-
-	mywrite_flush(wd);
 }
 
 static void write_bone(WriteData *wd, Bone *bone)
@@ -3196,156 +3049,101 @@ static void write_bone(WriteData *wd, Bone *bone)
 	}
 }
 
-static void write_armatures(WriteData *wd, ListBase *idbase)
+static void write_armature(WriteData *wd, bArmature *arm)
 {
-	bArmature   *arm;
-	Bone        *bone;
-
-	arm = idbase->first;
-	while (arm) {
-		if (arm->id.us > 0 || wd->current) {
-			writestruct(wd, ID_AR, bArmature, 1, arm);
-			write_iddata(wd, &arm->id);
+	if (arm->id.us > 0 || wd->current) {
+		writestruct(wd, ID_AR, bArmature, 1, arm);
+		write_iddata(wd, &arm->id);
 
-			if (arm->adt) {
-				write_animdata(wd, arm->adt);
-			}
+		if (arm->adt) {
+			write_animdata(wd, arm->adt);
+		}
 
-			/* Direct data */
-			bone = arm->bonebase.first;
-			while (bone) {
-				write_bone(wd, bone);
-				bone = bone->next;
-			}
+		/* Direct data */
+		for (Bone *bone = arm->bonebase.first; bone; bone = bone->next) {
+			write_bone(wd, bone);
 		}
-		arm = arm->id.next;
 	}
-
-	mywrite_flush(wd);
 }
 
-static void write_texts(WriteData *wd, ListBase *idbase)
+static void write_text(WriteData *wd, Text *text)
 {
-	Text *text;
-	TextLine *tmp;
+	if ((text->flags & TXT_ISMEM) && (text->flags & TXT_ISEXT)) {
+		text->flags &= ~TXT_ISEXT;
+	}
 
-	text = idbase->first;
-	while (text) {
-		if ( (text->flags & TXT_ISMEM) && (text->flags & TXT_ISEXT)) {
-			text->flags &= ~TXT_ISEXT;
-		}
+	/* write LibData */
+	writestruct(wd, ID_TXT, Text, 1, text);
+	write_iddata(wd, &text->id);
 
-		/* write LibData */
-		writestruct(wd, ID_TXT, Text, 1, text);
-		write_iddata(wd, &text->id);
+	if (text->name) {
+		writedata(wd, DATA, strlen(text->name) + 1, text->name);
+	}
 
-		if (text->name) {
-			writedata(wd, DATA, strlen(text->name) + 1, text->name);
+	if (!(text->flags & TXT_ISEXT)) {
+		/* now write the text data, in two steps for optimization in the readfunction */
+		for (TextLine *tmp = text->lines.first; tmp; tmp = tmp->next) {
+			writestruct(wd, DATA, TextLine, 1, tmp);
 		}
 
-		if (!(text->flags & TXT_ISEXT)) {
-			/* now write the text data, in two steps for optimization in the readfunction */
-			tmp = text->lines.first;
-			while (tmp) {
-				writestruct(wd, DATA, TextLine, 1, tmp);
-				tmp = tmp->next;
-			}
-
-			tmp = text->lines.first;
-			while (tmp) {
-				writedata(wd, DATA, tmp->len + 1, tmp->line);
-				tmp = tmp->next;
-			}
+		for (TextLine *tmp = text->lines.first; tmp; tmp = tmp->next) {
+			writedata(wd, DATA, tmp->len + 1, tmp->line);
 		}
-
-
-		text = text->id.next;
 	}
-
-	mywrite_flush(wd);
 }
 
-static void write_speakers(WriteData *wd, ListBase *idbase)
+static void write_speaker(WriteData *wd, Speaker *spk)
 {
-	Speaker *spk;
-
-	spk = idbase->first;
-	while (spk) {
-		if (spk->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_SPK, Speaker, 1, spk);
-			write_iddata(wd, &spk->id);
+	if (spk->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_SPK, Speaker, 1, spk);
+		write_iddata(wd, &spk->id);
 
-			if (spk->adt) {
-				write_animdata(wd, spk->adt);
-			}
+		if (spk->adt) {
+			write_animdata(wd, spk->adt);
 		}
-		spk = spk->id.next;
 	}
 }
 
-static void write_sounds(WriteData *wd, ListBase *idbase)
+static void write_sound(WriteData *wd, bSound *sound)
 {
-	bSound *sound;
-
-	PackedFile *pf;
-
-	sound = idbase->first;
-	while (sound) {
-		if (sound->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_SO, bSound, 1, sound);
-			write_iddata(wd, &sound->id);
+	if (sound->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_SO, bSound, 1, sound);
+		write_iddata(wd, &sound->id);
 
-			if (sound->packedfile) {
-				pf = sound->packedfile;
-				writestruct(wd, DATA, PackedFile, 1, pf);
-				writedata(wd, DATA, pf->size, pf->data);
-			}
+		if (sound->packedfile) {
+			PackedFile *pf = sound->packedfile;
+			writestruct(wd, DATA, PackedFile, 1, pf);
+			writedata(wd, DATA, pf->size, pf->data);
 		}
-		sound = sound->id.next;
 	}
-
-	mywrite_flush(wd);
 }
 
-static void write_groups(WriteData *wd, ListBase *idbase)
+static void write_group(WriteData *wd, Group *group)
 {
-	Group *group;
-	GroupObject *go;
-
-	for (group = idbase->first; group; group = group->id.next) {
-		if (group->id.us > 0 || wd->current) {
-			/* write LibData */
-			writestruct(wd, ID_GR, Group, 1, group);
-			write_iddata(wd, &group->id);
+	if (group->id.us > 0 || wd->current) {
+		/* write LibData */
+		writestruct(wd, ID_GR, Group, 1, group);
+		write_iddata(wd, &group->id);
 
-			write_previews(wd, group->preview);
+		write_previews(wd, group->preview);
 
-			go = group->gobject.first;
-			while (go) {
-				writestruct(wd, DATA, GroupObject, 1, go);
-				go = go->next;
-			}
+		for (GroupObject *go = group->gobject.first; go; go = go->next) {
+			writestruct(wd, DATA, GroupObject, 1, go);
 		}
 	}
-
-	mywrite_flush(wd);
 }
 
-static void write_nodetrees(WriteData *wd, ListBase *idbase)
+static void write_nodetree(WriteData *wd, bNodeTree *ntree)
 {
-	bNodeTree *ntree;
-
-	for (ntree = idbase->first; ntree; ntree = ntree->id.next) {
-		if (ntree->id.us > 0 || wd->current) {
-			writestruct(wd, ID_NT, bNodeTree, 1, ntree);
-			/* Note that trees directly used by other IDs (materials etc.) are not 'real' ID, they cannot
-			 * be linked, etc., so we write actual id data here only, for 'real' ID trees. */
-			write_iddata(wd, &ntree->id);
+	if (ntree->id.us > 0 || wd->current) {
+		writestruct(wd, ID_NT, bNodeTree, 1, ntree);
+		/* Note that trees directly used by other IDs (materials etc.) are not 'real' ID, they cannot
+		 * be linked, etc., so we write actual id data here only, for 'real' ID trees. */
+		write_iddata(wd, &ntree->id);
 
-			write_nodetree(wd, ntree);
-		}
+		write_nodetree_nolib(wd, ntree);
 	}
 }
 
@@ -3419,53 +3217,41 @@ static void customnodes_free_deprecated_data(Main *mainvar)
 }
 #endif
 
-static void write_brushes(WriteData *wd, ListBase *idbase)
+static void write_brush(WriteData *wd, Brush *brush)
 {
-	Brush *brush;
-
-	for (brush = idbase->first; brush; brush = brush->id.next) {
-		if (brush->id.us > 0 || wd->current) {
-			writestruct(wd, ID_BR, Brush, 1, brush);
-			write_iddata(wd, &brush->id);
+	if (brush->id.us > 0 || wd->current) {
+		writestruct(wd, ID_BR, Brush, 1, brush);
+		write_iddata(wd, &brush->id);
 
-			if (brush->curve) {
-				write_curvemapping(wd, brush->curve);
-			}
-			if (brush->gradient) {
-				writestruct(wd, DATA, ColorBand, 1, brush->gradient);
-			}
+		if (brush->curve) {
+			write_curvemapping(wd, brush->curve);
+		}
+		if (brush->gradient) {
+			writestruct(wd, DATA, ColorBand, 1, brush->gradient);
 		}
 	}
 }
 
-static void write_palettes(WriteData *wd, ListBase *idbase)
+static void write_palette(WriteData *wd, Palette *palette)
 {
-	Palette *palette;
-
-	for (palette = idbase->first; palette; palette = palette->id.next) {
-		if (palette->id.us > 0 || wd->current) {
-			PaletteColor *color;
-			writestruct(wd, ID_PAL, Palette, 1, palette);
-			write_iddata(wd, &palette->id);
+	if (palette->id.us > 0 || wd->current) {
+		PaletteColor *color;
+		writestruct(wd, ID_PAL, Palette, 1, palette);
+		write_iddata(wd, &palette->id);
 
-			for (color = palette->colors.first; color; color = color->next) {
-				writestruct(wd, DATA, PaletteColor, 1, color);
-			}
+		for (color = palette->colors.first; color; color = color->next) {
+			writestruct(wd, DATA, PaletteColor, 1, color);
 		}
 	}
 }
 
-static void write_paintcurves(WriteData *wd, ListBase *idbase)
+static void write_paintcurve(WriteData *wd, PaintCurve *pc)
 {
-	PaintCurve *pc;
-
-	for (pc = idbase->first; pc; pc = pc->id.next) {
-		if (pc->id.us > 0 || wd->current) {
-			writestruct(wd, ID_PC, PaintCurve, 1, pc);
-			write_iddata(wd, &pc->id);
+	if (pc->id.us > 0 || wd->current) {
+		writestruct(wd, ID_PC, PaintCurve, 1, pc);
+		write_iddata(wd, &pc->id);
 
-			writestruct(wd, DATA, PaintCurvePoint, pc->tot_points, pc->points);
-		}
+		writestruct(wd, DATA, PaintCurvePoint, pc->tot_points, pc->points);
 	}
 }
 
@@ -3507,103 +3293,85 @@ static void write_movieReconstruction(WriteData *wd, MovieTrackingReconstruction
 	}
 }
 
-static void write_movieclips(WriteData *wd, ListBase *idbase)
+static void write_movieclip(WriteData *wd, MovieClip *clip)
 {
-	MovieClip *clip;
-
-	clip = idbase->first;
-	while (clip) {
-		if (clip->id.us > 0 || wd->current) {
-			MovieTracking *tracking = &clip->tracking;
-			MovieTrackingObject *object;
+	if (clip->id.us > 0 || wd->current) {
+		MovieTracking *tracking = &clip->tracking;
+		MovieTrackingObject *object;
 
-			writestruct(wd, ID_MC, MovieClip, 1, clip);
-			write_iddata(wd, &clip->id);
+		writestruct(wd, ID_MC, MovieClip, 1, clip);
+		write_iddata(wd, &clip->id);
 
-			if (clip->adt) {
-				write_animdata(wd, clip->adt);
-			}
+		if (clip->adt) {
+			write_animdata(wd, clip->adt);
+		}
 
-			write_movieTracks(wd, &tracking->tracks);
-			write_moviePlaneTracks(wd, &tracking->plane_tracks);
-			write_movieReconstruction(wd, &tracking->reconstruction);
+		write_movieTracks(wd, &tracking->tracks);
+		write_moviePlaneTracks(wd, &tracking->plane_tracks);
+		write_movieReconstruction(wd, &tracking->reconstruction);
 
-			object = tracking->objects.first;
-			while (object) {
-				writestruct(wd, DATA, MovieTrackingObject, 1, object);
+		object = tracking->objects.first;
+		while (object) {
+			writestruct(wd, DATA, MovieTrackingObject, 1, object);
 
-				write_movieTracks(wd, &object->tracks);
-				write_moviePlaneTracks(wd, &object->plane_tracks);
-				write_movieReconstruction(wd, &object->reconstruction);
+			write_movieTracks(wd, &object->tracks);
+			write_moviePlaneTracks(wd, &object->plane_tracks);
+			write_movieReconstruction(wd, &object->reconstruction);
 
-				object = object->next;
-			}
+			object = object->next;
 		}
-
-		clip = clip->id.next;
 	}
-
-	mywrite_flush(wd);
 }
 
-static void write_masks(WriteData *wd, ListBase *idbase)
+static void write_mask(WriteData *wd, Mask *mask)
 {
-	Mask *mask;
-
-	mask = idbase->first;
-	while (mask) {
-		if (mask->id.us > 0 || wd->current) {
-			MaskLayer *masklay;
+	if (mask->id.us > 0 || wd->current) {
+		MaskLayer *masklay;
 
-			writestruct(wd, ID_MSK, Mask, 1, mask);
-			write_iddata(wd, &mask->id);
+		writestruct(wd, ID_MSK, Mask, 1, mask);
+		write_iddata(wd, &mask->id);
 
-			if (mask->adt) {
-				write_animdata(wd, mask->adt);
-			}
+		if (mask->adt) {
+			write_animdata(wd, mask->adt);
+		}
 
-			for (masklay = mask->masklayers.first; masklay; masklay = masklay->next) {
-				MaskSpline *spline;
-				MaskLayerShape *masklay_shape;
+		for (masklay = mask->masklayers.first; masklay; masklay = masklay->next) {
+			MaskSpline *spline;
+			MaskLayerShape *masklay_shape;
 
-				writestruct(wd, DATA, MaskLayer, 1, masklay);
+			writestruct(wd, DATA, MaskLayer, 1, masklay);
 
-				for (spline = masklay->splines.first; spline; spline = spline->next) {
-					int i;
+			for (spline = masklay->splines.first; spline; spline = spline->next) {
+				int i;
 
-					void *points_deform = spline->points_deform;
-					spline->points_deform = NULL;
+				void *points_deform = spline->points_deform;
+				spline->points_deform = NULL;
 
-					writestruct(wd, DATA, MaskSpline, 1, spline);
-					writestruct(wd, DATA, MaskSplinePoint, spline->tot_point, spline->points);
+				writestruct(wd, DATA, MaskSpline, 1, spline);
+				writestruct(wd, DATA, MaskSplinePoint, spline->tot_point, spline->points);
 
-					spline->points_deform = points_deform;
+				spline->points_deform = points_deform;
 
-					for (i = 0; i < spline->tot_point; i++) {
-						MaskSplinePoint *point = &spline->points[i];
+				for (i = 0; i < spline->tot_point; i++) {
+					MaskSplinePoint *point = &spline->points[i];
 
-						if (point->tot_uw) {
-							writestruct(wd, DATA, MaskSplinePointUW, point->tot_uw, point->uw);
-						}
+					if (point->tot_uw) {
+						writestruct(wd, DATA, MaskSplinePointUW, point->tot_uw, point->uw);
 					}
 				}
+			}
 
-				for (masklay_shape = masklay->splines_shapes.first;
-				     masklay_shape;
-				     masklay_shape = masklay_shape->next)
-				{
-					writestruct(wd, DATA, MaskLayerShape, 1, masklay_shape);
-					writedata(wd, DATA,
-					          masklay_shape->tot_vert * sizeof(float) * MASK_OBJECT_SHAPE_ELEM_SIZE,
-					          masklay_shape->data);
-				}
+			for (masklay_shape = masklay->splines_shapes.first;
+				 masklay_shape;
+				 masklay_shape = masklay_shape->next)
+			{
+				writestruct(wd, DATA, MaskLayerShape, 1, masklay_shape);
+				writedata(wd, DATA,
+						  masklay_shape->tot_vert * sizeof(float) * MASK_OBJECT_SHAPE_ELEM_SIZE,
+						  masklay_shape->data);
 			}
 		}
-
-		mask = mask->id.next;
 	}
-
-	mywrite_flush(wd);
 }
 
 static void write_linestyle_color_modifiers(WriteData *wd, ListBase *modifiers)
@@ -3860,48 +3628,39 @@ static void write_linestyle_geometry_modifiers(WriteData *wd, ListBase *modifier
 	}
 }
 
-static void write_linestyles(WriteData *wd, ListBase *idbase)
+static void write_linestyle(WriteData *wd, FreestyleLineStyle *linestyle)
 {
-	FreestyleLineStyle *linestyle;
-	int a;
+	if (linestyle->id.us > 0 || wd->current) {
+		writestruct(wd, ID_LS, FreestyleLineStyle, 1, linestyle);
+		write_iddata(wd, &linestyle->id);
 
-	for (linestyle = idbase->first; linestyle; linestyle = linestyle->id.next) {
-		if (linestyle->id.us > 0 || wd->current) {
-			writestruct(wd, ID_LS, FreestyleLineStyle, 1, linestyle);
-			write_iddata(wd, &linestyle->id);
-
-			if (linestyle->adt) {
-				write_animdata(wd, linestyle->adt);
-			}
+		if (linestyle->adt) {
+			write_animdata(wd, linestyle->adt);
+		}
 
-			write_linestyle_color_modifiers(wd, &linestyle->color_modifiers);
-			write_linestyle_alpha_modifiers(wd, &linestyle->alpha_modifiers);
-			write_linestyle_thickness_modifiers(wd, &linestyle->thickness_modifiers);
-			write_linestyle_geometry_modifiers(wd, &linestyle->geometry_modifiers);
-			for (a = 0; a < MAX_MTEX; a++) {
-				if (linestyle->mtex[a]) {
-					writestruct(wd, DATA, MTex, 1, linestyle->mtex[a]);
-				}
-			}
-			if (linestyle->nodetree) {
-				writestruct(wd, DATA, bNodeTree, 1, linestyle->nodetree);
-				write_nodetree(wd, linestyle->nodetree);
+		write_linestyle_color_modifiers(wd, &linestyle->color_modifiers);
+		write_linestyle_alpha_modifiers(wd, &linestyle->alpha_modifiers);
+		write_linestyle_thickness_modifiers(wd, &linestyle->thickness_modifiers);
+		write_linestyle_geometry_modifiers(wd, &linestyle->geometry_modifiers);
+		for (int a = 0; a < MAX_MTEX; a++) {
+			if (linestyle->mtex[a]) {
+				writestruct(wd, DATA, MTex, 1, linestyle->mtex[a]);
 			}
 		}
+		if (linestyle->nodetree) {
+			writestruct(wd, DATA, bNodeTree, 1, linestyle->nodetree);
+			write_nodetree_nolib(wd, linestyle->nodetree);
+		}
 	}
 }
 
-static void write_cachefiles(WriteData *wd, ListBase *idbase)
+static void write_cachefile(WriteData *wd, CacheFile *cache_file)
 {
-	CacheFile *cache_file;
+	if (cache_file->id.us > 0 || wd->current) {
+		writestruct(wd, ID_CF, CacheFile, 1, cache_file);
 
-	for (cache_file = idbase->first; cache_file; cache_file = cache_file->id.next) {
-		if (cache_file->id.us > 0 || wd->current) {
-			writestruct(wd, ID_CF, CacheFile, 1, cache_file);
-
-			if (cache_file->adt) {
-				write_animdata(wd, cache_file->adt);
-			}
+		if (cache_file->adt) {
+			write_animdata(wd, cache_file->adt);
 		}
 	}
 }
@@ -3915,7 +3674,6 @@ static void write_libraries(WriteData *wd, Main *main)
 	bool found_one;
 
 	for (; main; main = main->next) {
-
 		a = tot = set_listbasepointers(main, lbarray);
 
 		/* test: is lib being used */
@@ -3924,16 +3682,13 @@ static void write_libraries(WriteData *wd, Main *main)
 		}
 		else {
 			found_one = false;
-			while (tot--) {
+			while (!found_one && tot--) {
 				for (id = lbarray[tot]->first; id; id = id->next) {
 					if (id->us > 0 && (id->tag & LIB_TAG_EXTERN)) {
 						found_one = true;
 						break;
 					}
 				}
-				if (found_one) {
-					break;
-				}
 			}
 		}
 
@@ -4072,38 +3827,131 @@ static bool write_file_handle(
 	 * avoid thumbnail detecting changes because of this. */
 	mywrite_flush(wd);
 
-	write_windowmanagers(wd, &mainvar->wm);
-	write_screens(wd, &mainvar->screen);
-	write_movieclips(wd, &mainvar->movieclip);
-	write_masks(wd, &mainvar->mask);
-	write_scenes(wd, &mainvar->scene);
-	write_curves(wd, &mainvar->curve);
-	write_mballs(wd, &mainvar->mball);
-	write_images(wd, &mainvar->image);
-	write_cameras(wd, &mainvar->camera);
-	write_lamps(wd, &mainvar->lamp);
-	write_lattices(wd, &mainvar->latt);
-	write_vfonts(wd, &mainvar->vfont);
-	write_keys(wd, &mainvar->key);
-	write_worlds(wd, &mainvar->world);
-	write_texts(wd, &mainvar->text);
-	write_speakers(wd, &mainvar->speaker);
-	write_sounds(wd, &mainvar->sound);
-	write_groups(wd, &mainvar->group);
-	write_armatures(wd, &mainvar->armature);
-	write_actions(wd, &mainvar->action);
-	write_objects(wd, &mainvar->object);
-	write_materials(wd, &mainvar->mat);
-	write_textures(wd, &mainvar->tex);
-	write_meshes(wd, &mainvar->mesh);
-	write_particlesettings(wd, &mainvar->particle);
-	write_nodetrees(wd, &mainvar->nodetree);
-	write_brushes(wd, &mainvar->brush);
-	write_palettes(wd, &mainvar->palettes);
-	write_paintcurves(wd, &mainvar->paintcurves);
-	write_gpencils(wd, &mainvar->gpencil);
-	write_linestyles(wd, &mainvar->linestyle);
-	write_cachefiles(wd, &mainvar->cachefiles);
+	ListBase *lbarray[MAX_LIBARRAY];
+	int a = set_listbasepointers(mainvar, lbarray);
+	while (a--) {
+		ID *id = lbarray[a]->first;
+
+		if (id && GS(id->name) == ID_LI) {
+			continue;  /* Libraries are handled separately below. */
+		}
+
+		for (; id; id = id->next) {
+			switch ((ID_Type)GS(id->name)) {
+				case ID_WM:
+					write_windowmanager(wd, (wmWindowManager *)id);
+					break;
+				case ID_SCR:
+					write_screen(wd, (bScreen *)id);
+					break;
+				case ID_MC:
+					write_movieclip(wd, (MovieClip *)id);
+					break;
+				case ID_MSK:
+					write_mask(wd, (Mask *)id);
+					break;
+				case ID_SCE:
+					write_scene(wd, (Scene *)id);
+					break;
+				case ID_CU:
+					write_curve(wd,(Curve *)id);
+					break;
+				case ID_MB:
+					write_mball(wd, (MetaBall *)id);
+					break;
+				case ID_IM:
+					write_image(wd, (Image *)id);
+					break;
+				case ID_CA:
+					write_camera(wd, (Camera *)id);
+					break;
+				case ID_LA:
+					write_lamp(wd, (Lamp *)id);
+					break;
+				case ID_LT:
+					write_lattice(wd, (Lattice *)id);
+					break;
+				case ID_VF:
+					write_vfont(wd, (VFont *)id);
+					break;
+				case ID_KE:
+					write_key(wd, (Key *)id);
+					break;
+				case ID_WO:
+					write_world(wd, (World *)id);
+					break;
+				case ID_TXT:
+					write_text(wd, (Text *)id);
+					break;
+				case ID_SPK:
+					write_speaker(wd, (Speaker *)id);
+					break;
+				case ID_SO:
+					write_sound(wd, (bSound *)id);
+					break;
+				case ID_GR:
+					write_group(wd, (Group *)id);
+					break;
+				case ID_AR:
+					write_armature(wd, (bArmature *)id);
+					break;
+				case ID_AC:
+					write_action(wd, (bAction *)id);
+					break;
+				case ID_OB:
+					write_object(wd, (Object *)id);
+					break;
+				case ID_MA:
+					write_material(wd, (Material *)id);
+					break;
+				case ID_TE:
+					write_texture(wd, (Tex *)id);
+					break;
+				case ID_ME:
+					write_mesh(wd, (Mesh *)id);
+					break;
+				case ID_PA:
+					write_particlesettings(wd, (ParticleSettings *)id);
+					break;
+				case ID_NT:
+					write_nodetree(wd, (bNodeTree *)id);
+					break;
+				case ID_BR:
+					write_brush(wd, (Brush *)id);
+					break;
+				case ID_PAL:
+					write_palette(wd, (Palette *)id);
+					break;
+				case ID_PC:
+					write_paintcurve(wd, (PaintCurve *)id);
+					break;
+				case ID_GD:
+					write_gpencil(wd, (bGPdata *)id);
+					break;
+				case ID_LS:
+					write_linestyle(wd, (FreestyleLineStyle *)id);
+					break;
+				case ID_CF:
+					write_cachefile(wd, (CacheFile *)id);
+					break;
+				case ID_LI:
+					/* Do nothing, handled below - and should never be reached. */
+					BLI_assert(0);
+					break;
+				case ID_IP:
+					/* Do nothing, deprecated. */
+					break;
+				default:
+					/* Should never be reached. */
+					BLI_assert(0);
+					break;
+			}
+		}
+
+		mywrite_flush(wd);
+	}
+
+	/* Special handling, operating over split Mains... */
 	write_libraries(wd,  mainvar->next);
 
 	/* So changes above don't cause a 'DNA1' to be detected as changed on undo. */
diff --git a/source/blender/bmesh/CMakeLists.txt b/source/blender/bmesh/CMakeLists.txt
index 30fefe37f0e..ea24da86626 100644
--- a/source/blender/bmesh/CMakeLists.txt
+++ b/source/blender/bmesh/CMakeLists.txt
@@ -152,6 +152,8 @@ set(SRC
 	tools/bmesh_path_region.h
 	tools/bmesh_region_match.c
 	tools/bmesh_region_match.h
+	tools/bmesh_separate.c
+	tools/bmesh_separate.h
 	tools/bmesh_triangulate.c
 	tools/bmesh_triangulate.h
 	tools/bmesh_wireframe.c
diff --git a/source/blender/bmesh/bmesh.h b/source/blender/bmesh/bmesh.h
index f29d280d071..b84a3d5e559 100644
--- a/source/blender/bmesh/bmesh.h
+++ b/source/blender/bmesh/bmesh.h
@@ -192,9 +192,10 @@
  *
  * These conventions should be used throughout the bmesh module.
  *
- * - ``BM_***()`` -     High level BMesh API function for use anywhere.
- * - ``bmesh_***()`` -  Low level API function.
+ * - ``bmesh_kernel_*()`` - Low level API, for primitive functions that others are built on top of.
+ * - ``bmesh_***()`` - Low level API function.
  * - ``bm_***()`` -     'static' functions, not apart of the API at all, but use prefix since they operate on BMesh data.
+ * - ``BM_***()`` -     High level BMesh API function for use anywhere.
  * - ``BMO_***()`` -    High level operator API function for use anywhere.
  * - ``bmo_***()`` -    Low level / internal operator API functions.
  * - ``_bm_***()`` -    Functions which are called via macros only.
diff --git a/source/blender/bmesh/bmesh_tools.h b/source/blender/bmesh/bmesh_tools.h
index 23212dd085e..a537c3b872c 100644
--- a/source/blender/bmesh/bmesh_tools.h
+++ b/source/blender/bmesh/bmesh_tools.h
@@ -43,6 +43,7 @@ extern "C" {
 #include "tools/bmesh_path.h"
 #include "tools/bmesh_path_region.h"
 #include "tools/bmesh_region_match.h"
+#include "tools/bmesh_separate.h"
 #include "tools/bmesh_triangulate.h"
 
 #ifdef __cplusplus
diff --git a/source/blender/bmesh/intern/bmesh_core.c b/source/blender/bmesh/intern/bmesh_core.c
index d1178a198dc..cee5450a37d 100644
--- a/source/blender/bmesh/intern/bmesh_core.c
+++ b/source/blender/bmesh/intern/bmesh_core.c
@@ -1021,7 +1021,7 @@ static int UNUSED_FUNCTION(bm_loop_length)(BMLoop *l)
  * \param use_loop_mdisp_flip: When set, flip the Z-depth of the mdisp,
  * (use when flipping normals, disable when mirroring, eg: symmetrize).
  */
-void bmesh_loop_reverse(
+void bmesh_kernel_loop_reverse(
         BMesh *bm, BMFace *f,
         const int cd_loop_mdisp_offset, const bool use_loop_mdisp_flip)
 {
@@ -1438,7 +1438,7 @@ static BMFace *bm_face_create__sfme(BMesh *bm, BMFace *f_example)
  *
  * \return A BMFace pointer
  */
-BMFace *bmesh_sfme(
+BMFace *bmesh_kernel_split_face_make_edge(
         BMesh *bm, BMFace *f, BMLoop *l_v1, BMLoop *l_v2,
         BMLoop **r_l,
 #ifdef USE_BMESH_HOLES
@@ -1584,7 +1584,7 @@ BMFace *bmesh_sfme(
  *
  * \return The newly created BMVert pointer.
  */
-BMVert *bmesh_semv(BMesh *bm, BMVert *tv, BMEdge *e, BMEdge **r_e)
+BMVert *bmesh_kernel_split_edge_make_vert(BMesh *bm, BMVert *tv, BMEdge *e, BMEdge **r_e)
 {
 	BMLoop *l_next;
 	BMEdge *e_new;
@@ -1766,7 +1766,7 @@ BMVert *bmesh_semv(BMesh *bm, BMVert *tv, BMEdge *e, BMEdge **r_e)
  * faces with just 2 edges. It is up to the caller to decide what to do with
  * these faces.
  */
-BMEdge *bmesh_jekv(
+BMEdge *bmesh_kernel_join_edge_kill_vert(
         BMesh *bm, BMEdge *e_kill, BMVert *v_kill,
         const bool do_del, const bool check_edge_double,
         const bool kill_degenerate_faces)
@@ -1920,7 +1920,7 @@ BMEdge *bmesh_jekv(
  *
  * Collapse an edge, merging surrounding data.
  *
- * Unlike #BM_vert_collapse_edge & #bmesh_jekv which only handle 2 valence verts,
+ * Unlike #BM_vert_collapse_edge & #bmesh_kernel_join_edge_kill_vert which only handle 2 valence verts,
  * this can handle any number of connected edges/faces.
  *
  * <pre>
@@ -1932,7 +1932,7 @@ BMEdge *bmesh_jekv(
  * +-+-+-+    +-+-+-+
  * </pre>
  */
-BMVert *bmesh_jvke(
+BMVert *bmesh_kernel_join_vert_kill_edge(
         BMesh *bm, BMEdge *e_kill, BMVert *v_kill,
         const bool do_del, const bool check_edge_double,
         const bool kill_degenerate_faces)
@@ -2035,7 +2035,7 @@ BMVert *bmesh_jvke(
  * In the example A, faces \a f1 and \a f2 are joined by a single edge,
  * and the euler can safely be used.
  * In example B however, \a f1 and \a f2 are joined by multiple edges and will produce an error.
- * The caller in this case should call #bmesh_jekv on the extra edges
+ * The caller in this case should call #bmesh_kernel_join_edge_kill_vert on the extra edges
  * before attempting to fuse \a f1 and \a f2.
  *
  * \note The order of arguments decides whether or not certain per-face attributes are present
@@ -2044,7 +2044,7 @@ BMVert *bmesh_jvke(
  *
  * \return A BMFace pointer
  */
-BMFace *bmesh_jfke(BMesh *bm, BMFace *f1, BMFace *f2, BMEdge *e)
+BMFace *bmesh_kernel_join_face_kill_edge(BMesh *bm, BMFace *f1, BMFace *f2, BMEdge *e)
 {
 	BMLoop *l_iter, *l_f1 = NULL, *l_f2 = NULL;
 	int newlen = 0, i, f1len = 0, f2len = 0;
@@ -2249,7 +2249,7 @@ bool BM_vert_splice(BMesh *bm, BMVert *v_dst, BMVert *v_src)
 }
 
 
-/** \name BM_vert_separate, bmesh_vert_separate and friends
+/** \name BM_vert_separate, bmesh_kernel_vert_separate and friends
  * \{ */
 
 /* BM_edge_face_count(e) >= 1 */
@@ -2269,7 +2269,7 @@ BLI_INLINE bool bm_edge_supports_separate(const BMEdge *e)
  *
  * \return Success
  */
-void bmesh_vert_separate(
+void bmesh_kernel_vert_separate(
         BMesh *bm, BMVert *v, BMVert ***r_vout, int *r_vout_len,
         const bool copy_select)
 {
@@ -2385,7 +2385,7 @@ void bmesh_vert_separate(
  *
  * Takes a list of edges, which have been split from their original.
  *
- * Any edges which failed to split off in #bmesh_vert_separate will be merged back into the original edge.
+ * Any edges which failed to split off in #bmesh_kernel_vert_separate will be merged back into the original edge.
  *
  * \param edges_separate
  * A list-of-lists, each list is from a single original edge (the first edge is the original),
@@ -2398,7 +2398,7 @@ void bmesh_vert_separate(
  * \note this function looks like it could become slow,
  * but in common cases its only going to iterate a few times.
  */
-static void bmesh_vert_separate__cleanup(BMesh *bm, LinkNode *edges_separate)
+static void bmesh_kernel_vert_separate__cleanup(BMesh *bm, LinkNode *edges_separate)
 {
 	do {
 		LinkNode *n_orig = edges_separate->link;
@@ -2418,7 +2418,7 @@ static void bmesh_vert_separate__cleanup(BMesh *bm, LinkNode *edges_separate)
 }
 
 /**
- * High level function which wraps both #bmesh_vert_separate and #bmesh_edge_separate
+ * High level function which wraps both #bmesh_kernel_vert_separate and #bmesh_kernel_edge_separate
  */
 void BM_vert_separate(
         BMesh *bm, BMVert *v,
@@ -2435,7 +2435,7 @@ void BM_vert_separate(
 			LinkNode *edges_orig = NULL;
 			do {
 				BMLoop *l_sep = e->l;
-				bmesh_edge_separate(bm, e, l_sep, copy_select);
+				bmesh_kernel_edge_separate(bm, e, l_sep, copy_select);
 				BLI_linklist_prepend_alloca(&edges_orig, l_sep->e);
 				BLI_assert(e != l_sep->e);
 			} while (bm_edge_supports_separate(e));
@@ -2444,10 +2444,10 @@ void BM_vert_separate(
 		}
 	}
 
-	bmesh_vert_separate(bm, v, r_vout, r_vout_len, copy_select);
+	bmesh_kernel_vert_separate(bm, v, r_vout, r_vout_len, copy_select);
 
 	if (edges_separate) {
-		bmesh_vert_separate__cleanup(bm, edges_separate);
+		bmesh_kernel_vert_separate__cleanup(bm, edges_separate);
 	}
 }
 
@@ -2472,7 +2472,7 @@ void BM_vert_separate_hflag(
 				LinkNode *edges_orig = NULL;
 				do {
 					BMLoop *l_sep = e->l;
-					bmesh_edge_separate(bm, e, l_sep, copy_select);
+					bmesh_kernel_edge_separate(bm, e, l_sep, copy_select);
 					/* trick to avoid looping over separated edges */
 					if (edges_separate == NULL && edges_orig == NULL) {
 						e_first = l_sep->e;
@@ -2486,10 +2486,10 @@ void BM_vert_separate_hflag(
 		}
 	} while ((e_iter = BM_DISK_EDGE_NEXT(e_iter, v)) != e_first);
 
-	bmesh_vert_separate(bm, v, r_vout, r_vout_len, copy_select);
+	bmesh_kernel_vert_separate(bm, v, r_vout, r_vout_len, copy_select);
 
 	if (edges_separate) {
-		bmesh_vert_separate__cleanup(bm, edges_separate);
+		bmesh_kernel_vert_separate__cleanup(bm, edges_separate);
 	}
 }
 
@@ -2574,7 +2574,7 @@ bool BM_edge_splice(BMesh *bm, BMEdge *e_dst, BMEdge *e_src)
  * \note Does nothing if \a l_sep is already the only loop in the
  * edge radial.
  */
-void bmesh_edge_separate(
+void bmesh_kernel_edge_separate(
         BMesh *bm, BMEdge *e, BMLoop *l_sep,
         const bool copy_select)
 {
@@ -2620,7 +2620,7 @@ void bmesh_edge_separate(
  *
  * \note Will be a no-op and return original vertex if only two edges at that vertex.
  */
-BMVert *bmesh_urmv_loop(BMesh *bm, BMLoop *l_sep)
+BMVert *bmesh_kernel_unglue_region_make_vert(BMesh *bm, BMLoop *l_sep)
 {
 	BMVert *v_new = NULL;
 	BMVert *v_sep = l_sep->v;
@@ -2630,10 +2630,12 @@ BMVert *bmesh_urmv_loop(BMesh *bm, BMLoop *l_sep)
 
 	/* peel the face from the edge radials on both sides of the
 	 * loop vert, disconnecting the face from its fan */
-	if (!BM_edge_is_boundary(l_sep->e))
-		bmesh_edge_separate(bm, l_sep->e, l_sep, false);
-	if (!BM_edge_is_boundary(l_sep->prev->e))
-		bmesh_edge_separate(bm, l_sep->prev->e, l_sep->prev, false);
+	if (!BM_edge_is_boundary(l_sep->e)) {
+		bmesh_kernel_edge_separate(bm, l_sep->e, l_sep, false);
+	}
+	if (!BM_edge_is_boundary(l_sep->prev->e)) {
+		bmesh_kernel_edge_separate(bm, l_sep->prev->e, l_sep->prev, false);
+	}
 
 	/* do inline, below */
 #if 0
@@ -2681,21 +2683,23 @@ BMVert *bmesh_urmv_loop(BMesh *bm, BMLoop *l_sep)
 }
 
 /**
- * A version of #bmesh_urmv_loop that disconnects multiple loops at once.
+ * A version of #bmesh_kernel_unglue_region_make_vert that disconnects multiple loops at once.
  * The loops must all share the same vertex, can be in any order
  * and are all moved to use a single new vertex - which is returned.
  *
  * This function handles the details of finding fans boundaries.
  */
-BMVert *bmesh_urmv_loop_multi(
+BMVert *bmesh_kernel_unglue_region_make_vert_multi(
         BMesh *bm, BMLoop **larr, int larr_len)
 {
 	BMVert *v_sep = larr[0]->v;
 	BMVert *v_new;
+	int edges_len = 0;
 	int i;
-	bool is_mixed_any = false;
-
-	BLI_SMALLSTACK_DECLARE(edges, BMEdge *);
+	/* any edges not owned by 'larr' loops connected to 'v_sep'? */
+	bool is_mixed_edge_any = false;
+	/* any loops not owned by 'larr' radially connected to 'larr' loop edges? */
+	bool is_mixed_loop_any = false;
 
 #define LOOP_VISIT _FLAG_WALK
 #define EDGE_VISIT _FLAG_WALK
@@ -2713,58 +2717,74 @@ BMVert *bmesh_urmv_loop_multi(
 		 * while doing a radial loop (where loops may be adjacent) */
 		BM_ELEM_API_FLAG_ENABLE(l_sep->next, LOOP_VISIT);
 		BM_ELEM_API_FLAG_ENABLE(l_sep->prev, LOOP_VISIT);
-	}
-
-	for (i = 0; i < larr_len; i++) {
-		BMLoop *l_sep = larr[i];
 
 		BMLoop *loop_pair[2] = {l_sep, l_sep->prev};
-		int j;
-		for (j = 0; j < ARRAY_SIZE(loop_pair); j++) {
+		for (int j = 0; j < ARRAY_SIZE(loop_pair); j++) {
 			BMEdge *e = loop_pair[j]->e;
 			if (!BM_ELEM_API_FLAG_TEST(e, EDGE_VISIT)) {
-				BMLoop *l_iter, *l_first;
-				bool is_mixed = false;
-
 				BM_ELEM_API_FLAG_ENABLE(e, EDGE_VISIT);
+				edges_len += 1;
+			}
+		}
+	}
 
-				l_iter = l_first = e->l;
+	BMEdge **edges = BLI_array_alloca(edges, edges_len);
+	STACK_DECLARE(edges);
+
+	STACK_INIT(edges, edges_len);
+
+	{
+		BMEdge *e_first, *e_iter;
+		e_iter = e_first = v_sep->e;
+		do {
+			if (BM_ELEM_API_FLAG_TEST(e_iter, EDGE_VISIT)) {
+				BMLoop *l_iter, *l_first;
+				bool is_mixed_loop = false;
+
+				l_iter = l_first = e_iter->l;
 				do {
 					if (!BM_ELEM_API_FLAG_TEST(l_iter, LOOP_VISIT)) {
-						is_mixed = true;
-						is_mixed_any = true;
+						is_mixed_loop = true;
 						break;
 					}
 				} while ((l_iter = l_iter->radial_next) != l_first);
 
-				if (is_mixed) {
+				if (is_mixed_loop) {
 					/* ensure the first loop is one we don't own so we can do a quick check below
 					 * on the edge's loop-flag to see if the edge is mixed or not. */
-					e->l = l_iter;
+					e_iter->l = l_iter;
+
+					is_mixed_loop_any = true;
 				}
-				BLI_SMALLSTACK_PUSH(edges, e);
+
+				STACK_PUSH(edges, e_iter);
 			}
-		}
+			else {
+				/* at least one edge attached isn't connected to our loops */
+				is_mixed_edge_any = true;
+			}
+		} while ((e_iter = bmesh_disk_edge_next(e_iter, v_sep)) != e_first);
 	}
 
-	if (is_mixed_any == false) {
+	BLI_assert(edges_len == STACK_SIZE(edges));
+
+	if (is_mixed_loop_any == false && is_mixed_edge_any == false) {
 		/* all loops in 'larr' are the sole owners of their edges.
 		 * nothing to split away from, this is a no-op */
 		v_new = v_sep;
 	}
 	else {
-		BMEdge *e;
-
-		BLI_assert(!BLI_SMALLSTACK_IS_EMPTY(edges));
-
 		v_new = BM_vert_create(bm, v_sep->co, v_sep, BM_CREATE_NOP);
-		while ((e = BLI_SMALLSTACK_POP(edges))) {
+
+		for (i = 0; i < STACK_SIZE(edges); i++) {
+			BMEdge *e = edges[i];
 			BMLoop *l_iter, *l_first, *l_next;
 			BMEdge *e_new;
 
 			/* disable so copied edge isn't left dirty (loop edges are cleared last too) */
 			BM_ELEM_API_FLAG_DISABLE(e, EDGE_VISIT);
 
+			/* will always be false when (is_mixed_loop_any == false) */
 			if (!BM_ELEM_API_FLAG_TEST(e->l, LOOP_VISIT)) {
 				/* edge has some loops owned by us, some owned by other loops */
 				BMVert *e_new_v_pair[2];
@@ -2853,9 +2873,9 @@ static void bmesh_edge_vert_swap__recursive(BMEdge *e, BMVert *v_dst, BMVert *v_
 
 /**
  * This function assumes l_sep is apart of a larger fan which has already been
- * isolated by calling bmesh_edge_separate to segregate it radially.
+ * isolated by calling #bmesh_kernel_edge_separate to segregate it radially.
  */
-BMVert *bmesh_urmv_loop_region(BMesh *bm, BMLoop *l_sep)
+BMVert *bmesh_kernel_unglue_region_make_vert_multi_isolated(BMesh *bm, BMLoop *l_sep)
 {
 	BMVert *v_new = BM_vert_create(bm, l_sep->v->co, l_sep->v, BM_CREATE_NOP);
 	/* passing either 'l_sep->e', 'l_sep->prev->e' will work */
@@ -2864,20 +2884,6 @@ BMVert *bmesh_urmv_loop_region(BMesh *bm, BMLoop *l_sep)
 	return v_new;
 }
 
-
-/**
- * \brief Unglue Region Make Vert (URMV)
- *
- * Disconnects f_sep from the vertex fan at \a v_sep
- *
- * \return The newly created BMVert
- */
-BMVert *bmesh_urmv(BMesh *bm, BMFace *f_sep, BMVert *v_sep)
-{
-	BMLoop *l = BM_face_vert_share_loop(f_sep, v_sep);
-	return bmesh_urmv_loop(bm, l);
-}
-
 /**
  * Avoid calling this where possible,
  * low level function so both face pointers remain intact but point to swapped data.
diff --git a/source/blender/bmesh/intern/bmesh_core.h b/source/blender/bmesh/intern/bmesh_core.h
index f72e9d7b198..fb6b66809f3 100644
--- a/source/blender/bmesh/intern/bmesh_core.h
+++ b/source/blender/bmesh/intern/bmesh_core.h
@@ -64,21 +64,16 @@ void    BM_face_kill(BMesh *bm, BMFace *f);
 void    BM_edge_kill(BMesh *bm, BMEdge *e);
 void    BM_vert_kill(BMesh *bm, BMVert *v);
 
-void    bmesh_edge_separate(
-        BMesh *bm, BMEdge *e, BMLoop *l_sep,
-        const bool copy_select);
 bool    BM_edge_splice(BMesh *bm, BMEdge *e_dst, BMEdge *e_src);
 bool    BM_vert_splice(BMesh *bm, BMVert *v_dst, BMVert *v_src);
 bool    BM_vert_splice_check_double(BMVert *v_a, BMVert *v_b);
 
-void    bmesh_vert_separate(
-        BMesh *bm, BMVert *v, BMVert ***r_vout, int *r_vout_len,
-        const bool copy_select);
-
-void    bmesh_loop_reverse(
+void    bmesh_kernel_loop_reverse(
         BMesh *bm, BMFace *f,
         const int cd_loop_mdisp_offset, const bool use_loop_mdisp_flip);
 
+void    bmesh_face_swap_data(BMFace *f_a, BMFace *f_b);
+
 BMFace *BM_faces_join(BMesh *bm, BMFace **faces, int totface, const bool do_del);
 void    BM_vert_separate(
         BMesh *bm, BMVert *v, BMEdge **e_in, int e_in_len, const bool copy_select,
@@ -90,34 +85,43 @@ void BM_vert_separate_wire_hflag(
         BMesh *bm, BMVert *v_dst, BMVert *v_src,
         const char hflag);
 
-/* EULER API - For modifying structure */
-BMFace *bmesh_sfme(
+/**
+ * BMesh Kernel: For modifying structure.
+ *
+ * Names are on the verbose side but these are only for low-level access.
+ */
+void    bmesh_kernel_vert_separate(
+        BMesh *bm, BMVert *v, BMVert ***r_vout, int *r_vout_len,
+        const bool copy_select);
+void    bmesh_kernel_edge_separate(
+        BMesh *bm, BMEdge *e, BMLoop *l_sep,
+        const bool copy_select);
+
+BMFace *bmesh_kernel_split_face_make_edge(
         BMesh *bm, BMFace *f,
         BMLoop *l1, BMLoop *l2,
         BMLoop **r_l,
 #ifdef USE_BMESH_HOLES
-                   ListBase *holes,
+        ListBase *holes,
 #endif
-                   BMEdge *example,
-                   const bool no_double
-                   );
+        BMEdge *example,
+        const bool no_double
+        );
 
-BMVert *bmesh_semv(BMesh *bm, BMVert *tv, BMEdge *e, BMEdge **r_e);
-BMEdge *bmesh_jekv(
+BMVert *bmesh_kernel_split_edge_make_vert(
+        BMesh *bm, BMVert *tv, BMEdge *e, BMEdge **r_e);
+BMEdge *bmesh_kernel_join_edge_kill_vert(
         BMesh *bm, BMEdge *e_kill, BMVert *v_kill,
         const bool do_del, const bool check_edge_splice,
         const bool kill_degenerate_faces);
-BMVert *bmesh_jvke(
+BMVert *bmesh_kernel_join_vert_kill_edge(
         BMesh *bm, BMEdge *e_kill, BMVert *v_kill,
         const bool do_del, const bool check_edge_double,
         const bool kill_degenerate_faces);
-BMFace *bmesh_jfke(BMesh *bm, BMFace *f1, BMFace *f2, BMEdge *e);
-BMVert *bmesh_urmv(BMesh *bm, BMFace *f_sep, BMVert *v_sep);
-BMVert *bmesh_urmv_loop(BMesh *bm, BMLoop *l_sep);
-BMVert *bmesh_urmv_loop_multi(
-        BMesh *bm, BMLoop **larr, int larr_len);
-BMVert *bmesh_urmv_loop_region(BMesh *bm, BMLoop *l_sep);
+BMFace *bmesh_kernel_join_face_kill_edge(BMesh *bm, BMFace *f1, BMFace *f2, BMEdge *e);
 
-void    bmesh_face_swap_data(BMFace *f_a, BMFace *f_b);
+BMVert *bmesh_kernel_unglue_region_make_vert(BMesh *bm, BMLoop *l_sep);
+BMVert *bmesh_kernel_unglue_region_make_vert_multi(BMesh *bm, BMLoop **larr, int larr_len);
+BMVert *bmesh_kernel_unglue_region_make_vert_multi_isolated(BMesh *bm, BMLoop *l_sep);
 
 #endif /* __BMESH_CORE_H__ */
diff --git a/source/blender/bmesh/intern/bmesh_mesh.c b/source/blender/bmesh/intern/bmesh_mesh.c
index 57a6d8d2e1a..e9fb6a28154 100644
--- a/source/blender/bmesh/intern/bmesh_mesh.c
+++ b/source/blender/bmesh/intern/bmesh_mesh.c
@@ -486,8 +486,7 @@ static void bm_mesh_edges_sharp_tag(
         BMesh *bm, const float (*vnos)[3], const float (*fnos)[3], float split_angle,
         float (*r_lnos)[3])
 {
-	BMIter eiter, viter;
-	BMVert *v;
+	BMIter eiter;
 	BMEdge *e;
 	int i;
 
@@ -498,19 +497,13 @@ static void bm_mesh_edges_sharp_tag(
 	}
 
 	{
-		char htype = BM_LOOP;
+		char htype = BM_VERT | BM_LOOP;
 		if (fnos) {
 			htype |= BM_FACE;
 		}
 		BM_mesh_elem_index_ensure(bm, htype);
 	}
 
-	/* Clear all vertices' tags (means they are all smooth for now). */
-	BM_ITER_MESH_INDEX (v, &viter, bm, BM_VERTS_OF_MESH, i) {
-		BM_elem_index_set(v, i); /* set_inline */
-		BM_elem_flag_disable(v, BM_ELEM_TAG);
-	}
-
 	/* This first loop checks which edges are actually smooth, and pre-populate lnos with vnos (as if they were
 	 * all smooth).
 	 */
@@ -551,20 +544,45 @@ static void bm_mesh_edges_sharp_tag(
 				no = vnos ? vnos[BM_elem_index_get(l_b->v)] : l_b->v->no;
 				copy_v3_v3(r_lnos[BM_elem_index_get(l_b)], no);
 			}
-			else {
-				/* Sharp edge, tag its verts as such. */
-				BM_elem_flag_enable(e->v1, BM_ELEM_TAG);
-				BM_elem_flag_enable(e->v2, BM_ELEM_TAG);
+		}
+	}
+
+	bm->elem_index_dirty &= ~BM_EDGE;
+}
+
+/* Check whether given loop is part of an unknown-so-far cyclic smooth fan, or not.
+ * Needed because cyclic smooth fans have no obvious 'entry point', and yet we need to walk them once, and only once. */
+static bool bm_mesh_loop_check_cyclic_smooth_fan(BMLoop *l_curr)
+{
+	BMLoop *lfan_pivot_next = l_curr;
+	BMEdge *e_next = l_curr->e;
+
+	BLI_assert(!BM_elem_flag_test(lfan_pivot_next, BM_ELEM_TAG));
+	BM_elem_flag_enable(lfan_pivot_next, BM_ELEM_TAG);
+
+	while (true) {
+		/* Much simpler than in sibling code with basic Mesh data! */
+		lfan_pivot_next = BM_vert_step_fan_loop(lfan_pivot_next, &e_next);
+
+		if (!lfan_pivot_next || !BM_elem_flag_test(e_next, BM_ELEM_TAG)) {
+			/* Sharp loop/edge, so not a cyclic smooth fan... */
+			return false;
+		}
+		/* Smooth loop/edge... */
+		else if (BM_elem_flag_test(lfan_pivot_next, BM_ELEM_TAG)) {
+			if (lfan_pivot_next == l_curr) {
+				/* We walked around a whole cyclic smooth fan without finding any already-processed loop, means we can
+				 * use initial l_curr/l_prev edge as start for this smooth fan. */
+				return true;
 			}
+			/* ... already checked in some previous looping, we can abort. */
+			return false;
 		}
 		else {
-			/* Sharp edge, tag its verts as such. */
-			BM_elem_flag_enable(e->v1, BM_ELEM_TAG);
-			BM_elem_flag_enable(e->v2, BM_ELEM_TAG);
+			/* ... we can skip it in future, and keep checking the smooth fan. */
+			BM_elem_flag_enable(lfan_pivot_next, BM_ELEM_TAG);
 		}
 	}
-
-	bm->elem_index_dirty &= ~(BM_EDGE | BM_VERT);
 }
 
 /* BMesh version of BKE_mesh_normals_loop_split() in mesh_evaluate.c
@@ -587,13 +605,11 @@ static void bm_mesh_loops_calc_normals(
 	BLI_Stack *edge_vectors = NULL;
 
 	{
-		char htype = BM_LOOP;
+		char htype = 0;
 		if (vcos) {
 			htype |= BM_VERT;
 		}
-		if (fnos) {
-			htype |= BM_FACE;
-		}
+		/* Face/Loop indices are set inline below. */
 		BM_mesh_elem_index_ensure(bm, htype);
 	}
 
@@ -606,6 +622,21 @@ static void bm_mesh_loops_calc_normals(
 		edge_vectors = BLI_stack_new(sizeof(float[3]), __func__);
 	}
 
+	/* Clear all loops' tags (means none are to be skipped for now). */
+	int index_face, index_loop = 0;
+	BM_ITER_MESH_INDEX (f_curr, &fiter, bm, BM_FACES_OF_MESH, index_face) {
+		BMLoop *l_curr, *l_first;
+
+		BM_elem_index_set(f_curr, index_face); /* set_inline */
+
+		l_curr = l_first = BM_FACE_FIRST_LOOP(f_curr);
+		do {
+			BM_elem_index_set(l_curr, index_loop++); /* set_inline */
+			BM_elem_flag_disable(l_curr, BM_ELEM_TAG);
+		} while ((l_curr = l_curr->next) != l_first);
+	}
+	bm->elem_index_dirty &= ~(BM_FACE|BM_LOOP);
+
 	/* We now know edges that can be smoothed (they are tagged), and edges that will be hard (they aren't).
 	 * Now, time to generate the normals.
 	 */
@@ -614,16 +645,16 @@ static void bm_mesh_loops_calc_normals(
 
 		l_curr = l_first = BM_FACE_FIRST_LOOP(f_curr);
 		do {
+			/* A smooth edge, we have to check for cyclic smooth fan case.
+			 * If we find a new, never-processed cyclic smooth fan, we can do it now using that loop/edge as
+			 * 'entry point', otherwise we can skip it. */
+			/* Note: In theory, we could make bm_mesh_loop_check_cyclic_smooth_fan() store mlfan_pivot's in a stack,
+			 * to avoid having to fan again around the vert during actual computation of clnor & clnorspace.
+			 * However, this would complicate the code, add more memory usage, and BM_vert_step_fan_loop()
+			 * is quite cheap in terms of CPU cycles, so really think it's not worth it. */
 			if (BM_elem_flag_test(l_curr->e, BM_ELEM_TAG) &&
-			    (!r_lnors_spacearr || BM_elem_flag_test(l_curr->v, BM_ELEM_TAG)))
+			    (BM_elem_flag_test(l_curr, BM_ELEM_TAG) || !bm_mesh_loop_check_cyclic_smooth_fan(l_curr)))
 			{
-				/* A smooth edge, and we are not generating lnors_spacearr, or the related vertex is sharp.
-				 * We skip it because it is either:
-				 * - in the middle of a 'smooth fan' already computed (or that will be as soon as we hit
-				 *   one of its ends, i.e. one of its two sharp edges), or...
-				 * - the related vertex is a "full smooth" one, in which case pre-populated normals from vertex
-				 *   are just fine!
-				 */
 			}
 			else if (!BM_elem_flag_test(l_curr->e, BM_ELEM_TAG) &&
 			         !BM_elem_flag_test(l_curr->prev->e, BM_ELEM_TAG))
@@ -1481,23 +1512,6 @@ int BM_mesh_elem_count(BMesh *bm, const char htype)
 	}
 }
 
-/**
- * Special case: Python uses custom-data layers to hold PyObject references.
- * These have to be kept in-place, else the PyObject's we point to, wont point back to us.
- *
- * \note ``ele_src`` Is a duplicate, so we don't need to worry about getting in a feedback loop.
- *
- * \note If there are other customdata layers which need this functionality, it should be generalized.
- * However #BM_mesh_remap is currently the only place where this is done.
- */
-static void bm_mesh_remap_cd_update(
-        BMHeader *ele_dst, BMHeader *ele_src,
-        const int cd_elem_pyptr)
-{
-	void **pyptr_dst_p = BM_ELEM_CD_GET_VOID_P(((BMElem *)ele_dst), cd_elem_pyptr);
-	void **pyptr_src_p = BM_ELEM_CD_GET_VOID_P(((BMElem *)ele_src), cd_elem_pyptr);
-	*pyptr_dst_p = *pyptr_src_p;
-}
 
 /**
  * Remaps the vertices, edges and/or faces of the bmesh as indicated by vert/edge/face_idx arrays
@@ -1539,6 +1553,8 @@ void BM_mesh_remap(
 		BMVert **verts_pool, *verts_copy, **vep;
 		int i, totvert = bm->totvert;
 		const unsigned int *new_idx;
+		/* Special case: Python uses custom-data layers to hold PyObject references.
+		 * These have to be kept in-place, else the PyObjects we point to won't point back to us. */
 		const int cd_vert_pyptr  = CustomData_get_offset(&bm->vdata, CD_BM_ELEM_PYPTR);
 
 		/* Init the old-to-new vert pointers mapping */
@@ -1547,9 +1563,14 @@ void BM_mesh_remap(
 		/* Make a copy of all vertices. */
 		verts_pool = bm->vtable;
 		verts_copy = MEM_mallocN(sizeof(BMVert) * totvert, "BM_mesh_remap verts copy");
+		void **pyptrs = (cd_vert_pyptr != -1) ? MEM_mallocN(sizeof(void *) * totvert, __func__) : NULL;
 		for (i = totvert, ve = verts_copy + totvert - 1, vep = verts_pool + totvert - 1; i--; ve--, vep--) {
 			*ve = **vep;
 /*			printf("*vep: %p, verts_pool[%d]: %p\n", *vep, i, verts_pool[i]);*/
+			if (cd_vert_pyptr != -1) {
+				void **pyptr = BM_ELEM_CD_GET_VOID_P(((BMElem *)ve), cd_vert_pyptr);
+				pyptrs[i] = *pyptr;
+			}
 		}
 
 		/* Copy back verts to their new place, and update old2new pointers mapping. */
@@ -1562,13 +1583,17 @@ void BM_mesh_remap(
 /*			printf("mapping vert from %d to %d (%p/%p to %p)\n", i, *new_idx, *vep, verts_pool[i], new_vep);*/
 			BLI_ghash_insert(vptr_map, *vep, new_vep);
 			if (cd_vert_pyptr != -1) {
-				bm_mesh_remap_cd_update(&(*vep)->head, &new_vep->head, cd_vert_pyptr);
+				void **pyptr = BM_ELEM_CD_GET_VOID_P(((BMElem *)new_vep), cd_vert_pyptr);
+				*pyptr = pyptrs[*new_idx];
 			}
 		}
 		bm->elem_index_dirty |= BM_VERT;
 		bm->elem_table_dirty |= BM_VERT;
 
 		MEM_freeN(verts_copy);
+		if (pyptrs) {
+			MEM_freeN(pyptrs);
+		}
 	}
 
 	/* Remap Edges */
@@ -1576,6 +1601,8 @@ void BM_mesh_remap(
 		BMEdge **edges_pool, *edges_copy, **edp;
 		int i, totedge = bm->totedge;
 		const unsigned int *new_idx;
+		/* Special case: Python uses custom-data layers to hold PyObject references.
+		 * These have to be kept in-place, else the PyObjects we point to won't point back to us. */
 		const int cd_edge_pyptr  = CustomData_get_offset(&bm->edata, CD_BM_ELEM_PYPTR);
 
 		/* Init the old-to-new vert pointers mapping */
@@ -1584,8 +1611,13 @@ void BM_mesh_remap(
 		/* Make a copy of all vertices. */
 		edges_pool = bm->etable;
 		edges_copy = MEM_mallocN(sizeof(BMEdge) * totedge, "BM_mesh_remap edges copy");
+		void **pyptrs = (cd_edge_pyptr != -1) ? MEM_mallocN(sizeof(void *) * totedge, __func__) : NULL;
 		for (i = totedge, ed = edges_copy + totedge - 1, edp = edges_pool + totedge - 1; i--; ed--, edp--) {
 			*ed = **edp;
+			if (cd_edge_pyptr != -1) {
+				void **pyptr = BM_ELEM_CD_GET_VOID_P(((BMElem *)ed), cd_edge_pyptr);
+				pyptrs[i] = *pyptr;
+			}
 		}
 
 		/* Copy back verts to their new place, and update old2new pointers mapping. */
@@ -1598,13 +1630,17 @@ void BM_mesh_remap(
 			BLI_ghash_insert(eptr_map, *edp, new_edp);
 /*			printf("mapping edge from %d to %d (%p/%p to %p)\n", i, *new_idx, *edp, edges_pool[i], new_edp);*/
 			if (cd_edge_pyptr != -1) {
-				bm_mesh_remap_cd_update(&(*edp)->head, &new_edp->head, cd_edge_pyptr);
+				void **pyptr = BM_ELEM_CD_GET_VOID_P(((BMElem *)new_edp), cd_edge_pyptr);
+				*pyptr = pyptrs[*new_idx];
 			}
 		}
 		bm->elem_index_dirty |= BM_EDGE;
 		bm->elem_table_dirty |= BM_EDGE;
 
 		MEM_freeN(edges_copy);
+		if (pyptrs) {
+			MEM_freeN(pyptrs);
+		}
 	}
 
 	/* Remap Faces */
@@ -1612,6 +1648,8 @@ void BM_mesh_remap(
 		BMFace **faces_pool, *faces_copy, **fap;
 		int i, totface = bm->totface;
 		const unsigned int *new_idx;
+		/* Special case: Python uses custom-data layers to hold PyObject references.
+		 * These have to be kept in-place, else the PyObjects we point to won't point back to us. */
 		const int cd_poly_pyptr  = CustomData_get_offset(&bm->pdata, CD_BM_ELEM_PYPTR);
 
 		/* Init the old-to-new vert pointers mapping */
@@ -1620,8 +1658,13 @@ void BM_mesh_remap(
 		/* Make a copy of all vertices. */
 		faces_pool = bm->ftable;
 		faces_copy = MEM_mallocN(sizeof(BMFace) * totface, "BM_mesh_remap faces copy");
+		void **pyptrs = (cd_poly_pyptr != -1) ? MEM_mallocN(sizeof(void *) * totface, __func__) : NULL;
 		for (i = totface, fa = faces_copy + totface - 1, fap = faces_pool + totface - 1; i--; fa--, fap--) {
 			*fa = **fap;
+			if (cd_poly_pyptr != -1) {
+				void **pyptr = BM_ELEM_CD_GET_VOID_P(((BMElem *)fa), cd_poly_pyptr);
+				pyptrs[i] = *pyptr;
+			}
 		}
 
 		/* Copy back verts to their new place, and update old2new pointers mapping. */
@@ -1633,7 +1676,8 @@ void BM_mesh_remap(
 			*new_fap = *fa;
 			BLI_ghash_insert(fptr_map, *fap, new_fap);
 			if (cd_poly_pyptr != -1) {
-				bm_mesh_remap_cd_update(&(*fap)->head, &new_fap->head, cd_poly_pyptr);
+				void **pyptr = BM_ELEM_CD_GET_VOID_P(((BMElem *)new_fap), cd_poly_pyptr);
+				*pyptr = pyptrs[*new_idx];
 			}
 		}
 
@@ -1641,6 +1685,9 @@ void BM_mesh_remap(
 		bm->elem_table_dirty |= BM_FACE;
 
 		MEM_freeN(faces_copy);
+		if (pyptrs) {
+			MEM_freeN(pyptrs);
+		}
 	}
 
 	/* And now, fix all vertices/edges/faces/loops pointers! */
@@ -2008,4 +2055,4 @@ void BM_mesh_toolflags_set(BMesh *bm, bool use_toolflags)
 	        vpool_dst, epool_dst, NULL, fpool_dst);
 
 	bm->use_toolflags = use_toolflags;
-}
-\ No newline at end of file
+}
diff --git a/source/blender/bmesh/intern/bmesh_mods.c b/source/blender/bmesh/intern/bmesh_mods.c
index 500da6b8788..1cd51528e06 100644
--- a/source/blender/bmesh/intern/bmesh_mods.c
+++ b/source/blender/bmesh/intern/bmesh_mods.c
@@ -234,7 +234,7 @@ BMFace *BM_faces_join_pair(BMesh *bm, BMLoop *l_a, BMLoop *l_b, const bool do_de
 
 	if (l_a->v == l_b->v) {
 		const int cd_loop_mdisp_offset = CustomData_get_offset(&bm->ldata, CD_MDISPS);
-		bmesh_loop_reverse(bm, l_b->f, cd_loop_mdisp_offset, true);
+		bmesh_kernel_loop_reverse(bm, l_b->f, cd_loop_mdisp_offset, true);
 	}
 
 	BMFace *faces[2] = {l_a->f, l_b->f};
@@ -288,9 +288,9 @@ BMFace *BM_face_split(
 	}
 	
 #ifdef USE_BMESH_HOLES
-	f_new = bmesh_sfme(bm, f, l_a, l_b, r_l, NULL, example, no_double);
+	f_new = bmesh_kernel_split_face_make_edge(bm, f, l_a, l_b, r_l, NULL, example, no_double);
 #else
-	f_new = bmesh_sfme(bm, f, l_a, l_b, r_l, example, no_double);
+	f_new = bmesh_kernel_split_face_make_edge(bm, f, l_a, l_b, r_l, example, no_double);
 #endif
 	
 	if (f_new) {
@@ -370,19 +370,19 @@ BMFace *BM_face_split_n(
 	f_tmp = BM_face_copy(bm, bm, f, true, true);
 	
 #ifdef USE_BMESH_HOLES
-	f_new = bmesh_sfme(bm, f, l_a, l_b, &l_new, NULL, example, false);
+	f_new = bmesh_kernel_split_face_make_edge(bm, f, l_a, l_b, &l_new, NULL, example, false);
 #else
-	f_new = bmesh_sfme(bm, f, l_a, l_b, &l_new, example, false);
+	f_new = bmesh_kernel_split_face_make_edge(bm, f, l_a, l_b, &l_new, example, false);
 #endif
-	/* bmesh_sfme returns in 'l_new' a Loop for f_new going from 'v_a' to 'v_b'.
+	/* bmesh_kernel_split_face_make_edge returns in 'l_new' a Loop for f_new going from 'v_a' to 'v_b'.
 	 * The radial_next is for 'f' and goes from 'v_b' to 'v_a'  */
 
 	if (f_new) {
 		e = l_new->e;
 		for (i = 0; i < n; i++) {
-			v_new = bmesh_semv(bm, v_b, e, &e_new);
+			v_new = bmesh_kernel_split_edge_make_vert(bm, v_b, e, &e_new);
 			BLI_assert(v_new != NULL);
-			/* bmesh_semv returns in 'e_new' the edge going from 'v_new' to 'v_b' */
+			/* bmesh_kernel_split_edge_make_vert returns in 'e_new' the edge going from 'v_new' to 'v_b' */
 			copy_v3_v3(v_new->co, cos[i]);
 
 			/* interpolate the loop data for the loops with (v == v_new), using orig face */
@@ -507,7 +507,7 @@ BMEdge *BM_vert_collapse_faces(
 		/* single face or no faces */
 		/* same as BM_vert_collapse_edge() however we already
 		 * have vars to perform this operation so don't call. */
-		e_new = bmesh_jekv(bm, e_kill, v_kill, do_del, true, kill_degenerate_faces);
+		e_new = bmesh_kernel_join_edge_kill_vert(bm, e_kill, v_kill, do_del, true, kill_degenerate_faces);
 		/* e_new = BM_edge_exists(tv, tv2); */ /* same as return above */
 	}
 
@@ -542,7 +542,7 @@ BMEdge *BM_vert_collapse_edge(
 			BMVert *tv2 = BM_edge_other_vert(e2, v_kill);
 			if (tv2) {
 				/* only action, other calls here only get the edge to return */
-				e_new = bmesh_jekv(bm, e_kill, v_kill, do_del, true, kill_degenerate_faces);
+				e_new = bmesh_kernel_join_edge_kill_vert(bm, e_kill, v_kill, do_del, true, kill_degenerate_faces);
 			}
 		}
 	}
@@ -564,7 +564,7 @@ BMVert *BM_edge_collapse(
         BMesh *bm, BMEdge *e_kill, BMVert *v_kill,
         const bool do_del, const bool kill_degenerate_faces)
 {
-	return bmesh_jvke(bm, e_kill, v_kill, do_del, true, kill_degenerate_faces);
+	return bmesh_kernel_join_vert_kill_edge(bm, e_kill, v_kill, do_del, true, kill_degenerate_faces);
 }
 
 /**
@@ -616,7 +616,7 @@ BMVert *BM_edge_split(BMesh *bm, BMEdge *e, BMVert *v, BMEdge **r_e, float fac)
 	}
 
 	v_other = BM_edge_other_vert(e, v);
-	v_new = bmesh_semv(bm, v, e, &e_new);
+	v_new = bmesh_kernel_split_edge_make_vert(bm, v, e, &e_new);
 	if (r_e != NULL) {
 		*r_e = e_new;
 	}
@@ -1090,23 +1090,18 @@ BMEdge *BM_edge_rotate(BMesh *bm, BMEdge *e, const bool ccw, const short check_f
 /**
  * \brief Rip a single face from a vertex fan
  */
-BMVert *BM_face_vert_separate(BMesh *bm, BMFace *sf, BMVert *sv)
+BMVert *BM_face_loop_separate(BMesh *bm, BMLoop *l_sep)
 {
-	return bmesh_urmv(bm, sf, sv);
+	return bmesh_kernel_unglue_region_make_vert(bm, l_sep);
 }
 
-/**
- * \brief Rip a single face from a vertex fan
- *
- * \note same as #BM_face_vert_separate but faster (avoids a loop lookup)
- */
-BMVert *BM_face_loop_separate(BMesh *bm, BMLoop *sl)
+BMVert *BM_face_loop_separate_multi_isolated(BMesh *bm, BMLoop *l_sep)
 {
-	return bmesh_urmv_loop(bm, sl);
+	return bmesh_kernel_unglue_region_make_vert_multi_isolated(bm, l_sep);
 }
 
-BMVert *BM_face_loop_separate_multi(
-        BMesh *bm, BMLoop **larr, int larr_len)
+BMVert *BM_face_loop_separate_multi(BMesh *bm, BMLoop **larr, int larr_len)
 {
-	return bmesh_urmv_loop_multi(bm, larr, larr_len);
+	return bmesh_kernel_unglue_region_make_vert_multi(bm, larr, larr_len);
 }
+
diff --git a/source/blender/bmesh/intern/bmesh_mods.h b/source/blender/bmesh/intern/bmesh_mods.h
index 5e95e9a2cc7..330a714418d 100644
--- a/source/blender/bmesh/intern/bmesh_mods.h
+++ b/source/blender/bmesh/intern/bmesh_mods.h
@@ -86,9 +86,8 @@ enum {
 };
 
 
-BMVert *BM_face_vert_separate(BMesh *bm, BMFace *sf, BMVert *sv);
-BMVert *BM_face_loop_separate(BMesh *bm, BMLoop *sl);
-BMVert *BM_face_loop_separate_multi(
-        BMesh *bm, BMLoop **larr, int larr_len);
+BMVert *BM_face_loop_separate(BMesh *bm, BMLoop *l_sep);
+BMVert *BM_face_loop_separate_multi_isolated(BMesh *bm, BMLoop *l_sep);
+BMVert *BM_face_loop_separate_multi(BMesh *bm, BMLoop **larr, int larr_len);
 
 #endif /* __BMESH_MODS_H__ */
diff --git a/source/blender/bmesh/intern/bmesh_opdefines.c b/source/blender/bmesh/intern/bmesh_opdefines.c
index 0d0fdda2c4c..6b388a75436 100644
--- a/source/blender/bmesh/intern/bmesh_opdefines.c
+++ b/source/blender/bmesh/intern/bmesh_opdefines.c
@@ -1284,7 +1284,7 @@ static BMOpDefine bmo_bisect_plane_def = {
 	 {"clear_inner",   BMO_OP_SLOT_BOOL},    /* when enabled. remove all geometry on the negative side of the plane */
 	 {{'\0'}},
 	},
-	{{"geom_cut.out", BMO_OP_SLOT_ELEMENT_BUF, {BM_VERT | BM_EDGE}},  /* output new geometry from the cut */
+	{{"geom_cut.out", BMO_OP_SLOT_ELEMENT_BUF, {BM_VERT | BM_EDGE}},  /* output geometry aligned with the plane (new and existing) */
 	 {"geom.out",     BMO_OP_SLOT_ELEMENT_BUF, {BM_VERT | BM_EDGE | BM_FACE}},  /* input and output geometry (result of cut)  */
 	 {{'\0'}}},
 	bmo_bisect_plane_exec,
diff --git a/source/blender/bmesh/intern/bmesh_polygon.c b/source/blender/bmesh/intern/bmesh_polygon.c
index 6acd790fc0c..78ee7589d23 100644
--- a/source/blender/bmesh/intern/bmesh_polygon.c
+++ b/source/blender/bmesh/intern/bmesh_polygon.c
@@ -844,7 +844,7 @@ void BM_face_normal_flip_ex(
         BMesh *bm, BMFace *f,
         const int cd_loop_mdisp_offset, const bool use_loop_mdisp_flip)
 {
-	bmesh_loop_reverse(bm, f, cd_loop_mdisp_offset, use_loop_mdisp_flip);
+	bmesh_kernel_loop_reverse(bm, f, cd_loop_mdisp_offset, use_loop_mdisp_flip);
 	negate_v3(f->no);
 }
 
diff --git a/source/blender/bmesh/operators/bmo_bisect_plane.c b/source/blender/bmesh/operators/bmo_bisect_plane.c
index bed1ea5cb94..2c80ff651b8 100644
--- a/source/blender/bmesh/operators/bmo_bisect_plane.c
+++ b/source/blender/bmesh/operators/bmo_bisect_plane.c
@@ -38,7 +38,8 @@
 #include "intern/bmesh_operators_private.h" /* own include */
 
 #define ELE_NEW 1
-#define ELE_INPUT 2
+#define ELE_CUT 2
+#define ELE_INPUT 4
 
 void bmo_bisect_plane_exec(BMesh *bm, BMOperator *op)
 {
@@ -69,7 +70,7 @@ void bmo_bisect_plane_exec(BMesh *bm, BMOperator *op)
 
 
 	BM_mesh_bisect_plane(bm, plane, use_snap_center, true,
-	                     ELE_NEW, dist);
+	                     ELE_CUT, ELE_NEW, dist);
 
 
 	if (clear_outer || clear_inner) {
@@ -108,5 +109,5 @@ void bmo_bisect_plane_exec(BMesh *bm, BMOperator *op)
 	}
 
 	BMO_slot_buffer_from_enabled_flag(bm, op, op->slots_out, "geom.out", BM_ALL_NOLOOP, ELE_NEW | ELE_INPUT);
-	BMO_slot_buffer_from_enabled_flag(bm, op, op->slots_out, "geom_cut.out", BM_VERT | BM_EDGE, ELE_NEW);
+	BMO_slot_buffer_from_enabled_flag(bm, op, op->slots_out, "geom_cut.out", BM_VERT | BM_EDGE, ELE_CUT);
 }
diff --git a/source/blender/bmesh/operators/bmo_dupe.c b/source/blender/bmesh/operators/bmo_dupe.c
index 56639a097b6..8048add84d4 100644
--- a/source/blender/bmesh/operators/bmo_dupe.c
+++ b/source/blender/bmesh/operators/bmo_dupe.c
@@ -378,6 +378,10 @@ void BMO_dupe_from_flag(BMesh *bm, int htype, const char hflag)
  * BMOP_DUPE_VOUTPUT: Buffer containing pointers to the split mesh vertices
  * BMOP_DUPE_EOUTPUT: Buffer containing pointers to the split mesh edges
  * BMOP_DUPE_FOUTPUT: Buffer containing pointers to the split mesh faces
+ *
+ * \note Lower level uses of this operator may want to use #BM_mesh_separate_faces
+ * since it's faster for the 'use_only_faces' case.
+ *
  */
 void bmo_split_exec(BMesh *bm, BMOperator *op)
 {
diff --git a/source/blender/bmesh/operators/bmo_inset.c b/source/blender/bmesh/operators/bmo_inset.c
index e2ff09669d7..f0796e425fa 100644
--- a/source/blender/bmesh/operators/bmo_inset.c
+++ b/source/blender/bmesh/operators/bmo_inset.c
@@ -663,7 +663,7 @@ void bmo_inset_region_exec(BMesh *bm, BMOperator *op)
 
 		/* run the separate arg */
 		if (!BM_edge_is_boundary(es->e_old)) {
-			bmesh_edge_separate(bm, es->e_old, es->l, false);
+			bmesh_kernel_edge_separate(bm, es->e_old, es->l, false);
 		}
 
 		/* calc edge-split info */
@@ -742,7 +742,7 @@ void bmo_inset_region_exec(BMesh *bm, BMOperator *op)
 				/* disable touching twice, this _will_ happen if the flags not disabled */
 				BM_elem_flag_disable(v, BM_ELEM_TAG);
 
-				bmesh_vert_separate(bm, v, &vout, &r_vout_len, false);
+				bmesh_kernel_vert_separate(bm, v, &vout, &r_vout_len, false);
 				v = NULL; /* don't use again */
 
 				/* in some cases the edge doesn't split off */
diff --git a/source/blender/bmesh/operators/bmo_offset_edgeloops.c b/source/blender/bmesh/operators/bmo_offset_edgeloops.c
index 7a6f779b34f..a9840a72fc9 100644
--- a/source/blender/bmesh/operators/bmo_offset_edgeloops.c
+++ b/source/blender/bmesh/operators/bmo_offset_edgeloops.c
@@ -270,7 +270,7 @@ void bmo_offset_edgeloops_exec(BMesh *bm, BMOperator *op)
 				v_other = BM_edge_other_vert(e, v);
 				if (BM_elem_index_get(v_other) == -1) {
 					if (BM_vert_is_edge_pair(v_other)) {
-						/* defer bmesh_jekv to avoid looping over data we're removing */
+						/* defer bmesh_kernel_join_edge_kill_vert to avoid looping over data we're removing */
 						v_other->e = e;
 						STACK_PUSH(varr, v_other);
 					}
@@ -278,7 +278,7 @@ void bmo_offset_edgeloops_exec(BMesh *bm, BMOperator *op)
 			}
 
 			while ((v = STACK_POP(varr))) {
-				bmesh_jekv(bm, v->e, v, true, false, false);
+				bmesh_kernel_join_edge_kill_vert(bm, v->e, v, true, false, false);
 			}
 		}
 	}
diff --git a/source/blender/bmesh/operators/bmo_primitive.c b/source/blender/bmesh/operators/bmo_primitive.c
index 8408169d85e..723e0b168e0 100644
--- a/source/blender/bmesh/operators/bmo_primitive.c
+++ b/source/blender/bmesh/operators/bmo_primitive.c
@@ -1122,7 +1122,7 @@ static void bm_mesh_calc_uvs_sphere_face(BMFace *f, const int cd_loop_uv_offset)
 		}
 
 		/* Shift borderline coordinates to the left. */
-		if (fabsf(theta - M_PI) < 0.0001f) {
+		if (fabsf(theta - (float)M_PI) < 0.0001f) {
 			theta = -M_PI;
 		}
 
diff --git a/source/blender/bmesh/tools/bmesh_bisect_plane.c b/source/blender/bmesh/tools/bmesh_bisect_plane.c
index 51b92a3c45e..828c50c39fd 100644
--- a/source/blender/bmesh/tools/bmesh_bisect_plane.c
+++ b/source/blender/bmesh/tools/bmesh_bisect_plane.c
@@ -110,7 +110,7 @@ static int bm_vert_sortval_cb(const void *v_a_v, const void *v_b_v)
 }
 
 
-static void bm_face_bisect_verts(BMesh *bm, BMFace *f, const float plane[4], const short oflag_center)
+static void bm_face_bisect_verts(BMesh *bm, BMFace *f, const float plane[4], const short oflag_center, const short oflag_new)
 {
 	/* unlikely more than 2 verts are needed */
 	const unsigned int f_len_orig = (unsigned int)f->len;
@@ -154,10 +154,11 @@ static void bm_face_bisect_verts(BMesh *bm, BMFace *f, const float plane[4], con
 			/* common case, just cut the face once */
 			BM_face_split(bm, f, l_a, l_b, &l_new, NULL, true);
 			if (l_new) {
-				if (oflag_center) {
-					BMO_edge_flag_enable(bm, l_new->e, oflag_center);
-					BMO_face_flag_enable(bm, l_new->f, oflag_center);
-					BMO_face_flag_enable(bm, f,        oflag_center);
+				if (oflag_center | oflag_new) {
+					BMO_edge_flag_enable(bm, l_new->e, oflag_center | oflag_new);
+				}
+				if (oflag_new) {
+					BMO_face_flag_enable(bm, l_new->f, oflag_new);
 				}
 			}
 		}
@@ -269,10 +270,11 @@ static void bm_face_bisect_verts(BMesh *bm, BMFace *f, const float plane[4], con
 						f_tmp = BM_face_split(bm, face_split_arr[j], l_a, l_b, &l_new, NULL, true);
 
 						if (l_new) {
-							if (oflag_center) {
-								BMO_edge_flag_enable(bm, l_new->e,          oflag_center);
-								BMO_face_flag_enable(bm, l_new->f,          oflag_center);
-								BMO_face_flag_enable(bm, face_split_arr[j], oflag_center);
+							if (oflag_center | oflag_new) {
+								BMO_edge_flag_enable(bm, l_new->e, oflag_center | oflag_new);
+							}
+							if (oflag_new) {
+								BMO_face_flag_enable(bm, l_new->f, oflag_new);
 							}
 						}
 
@@ -307,7 +309,7 @@ finally:
 void BM_mesh_bisect_plane(
         BMesh *bm, const float plane[4],
         const bool use_snap_center, const bool use_tag,
-        const short oflag_center, const float eps)
+        const short oflag_center, const short oflag_new, const float eps)
 {
 	unsigned int einput_len;
 	unsigned int i;
@@ -390,7 +392,7 @@ void BM_mesh_bisect_plane(
 		const float dist[2] = {BM_VERT_DIST(e->v1), BM_VERT_DIST(e->v2)};
 
 		if (side[0] && side[1] && (side[0] != side[1])) {
-			const float e_fac = fabsf(dist[0]) / fabsf(dist[0] - dist[1]);
+			const float e_fac = dist[0] / (dist[0] - dist[1]);
 			BMVert *v_new;
 
 			if (e->l) {
@@ -404,10 +406,17 @@ void BM_mesh_bisect_plane(
 				} while ((l_iter = l_iter->radial_next) != l_first);
 			}
 
-			v_new = BM_edge_split(bm, e, e->v1, NULL, e_fac);
+			{
+				BMEdge *e_new;
+				v_new = BM_edge_split(bm, e, e->v1, &e_new, e_fac);
+				if (oflag_new) {
+					BMO_edge_flag_enable(bm, e_new, oflag_new);
+				}
+			}
+
 			vert_is_center_enable(v_new);
-			if (oflag_center) {
-				BMO_vert_flag_enable(bm, v_new, oflag_center);
+			if (oflag_new | oflag_center) {
+				BMO_vert_flag_enable(bm, v_new, oflag_new | oflag_center);
 			}
 
 			BM_VERT_DIR(v_new) = 0;
@@ -448,7 +457,7 @@ void BM_mesh_bisect_plane(
 	MEM_freeN(edges_arr);
 
 	while ((f = BLI_LINKSTACK_POP(face_stack))) {
-		bm_face_bisect_verts(bm, f, plane, oflag_center);
+		bm_face_bisect_verts(bm, f, plane, oflag_center, oflag_new);
 	}
 
 	/* now we have all faces to split in the stack */
diff --git a/source/blender/bmesh/tools/bmesh_bisect_plane.h b/source/blender/bmesh/tools/bmesh_bisect_plane.h
index 7f3a97c4c79..fb99a1c8214 100644
--- a/source/blender/bmesh/tools/bmesh_bisect_plane.h
+++ b/source/blender/bmesh/tools/bmesh_bisect_plane.h
@@ -30,6 +30,6 @@
 void BM_mesh_bisect_plane(
         BMesh *bm, const float plane[4],
         const bool use_snap_center, const bool use_tag,
-        const short oflag_center, const float eps);
+        const short oflag_center, const short oflag_new, const float eps);
 
 #endif /* __BMESH_BISECT_PLANE_H__ */
diff --git a/source/blender/bmesh/tools/bmesh_intersect.c b/source/blender/bmesh/tools/bmesh_intersect.c
index 58234ddf3bd..2cb82d0fc02 100644
--- a/source/blender/bmesh/tools/bmesh_intersect.c
+++ b/source/blender/bmesh/tools/bmesh_intersect.c
@@ -986,7 +986,7 @@ bool BM_mesh_intersect(
         struct BMLoop *(*looptris)[3], const int looptris_tot,
         int (*test_fn)(BMFace *f, void *user_data), void *user_data,
         const bool use_self, const bool use_separate, const bool use_dissolve, const bool use_island_connect,
-        const int boolean_mode,
+        const bool use_edge_tag, const int boolean_mode,
         const float eps)
 {
 	struct ISectState s;
@@ -1526,7 +1526,7 @@ bool BM_mesh_intersect(
 
 		BM_mesh_edgesplit(bm, false, true, false);
 	}
-	else if (boolean_mode != BMESH_ISECT_BOOLEAN_NONE) {
+	else if (boolean_mode != BMESH_ISECT_BOOLEAN_NONE || use_edge_tag) {
 		GSetIterator gs_iter;
 
 		/* no need to clear for boolean */
diff --git a/source/blender/bmesh/tools/bmesh_intersect.h b/source/blender/bmesh/tools/bmesh_intersect.h
index d0cc41654eb..51926a01710 100644
--- a/source/blender/bmesh/tools/bmesh_intersect.h
+++ b/source/blender/bmesh/tools/bmesh_intersect.h
@@ -30,7 +30,7 @@ bool BM_mesh_intersect(
         struct BMLoop *(*looptris)[3], const int looptris_tot,
         int (*test_fn)(BMFace *f, void *user_data), void *user_data,
         const bool use_self, const bool use_separate, const bool use_dissolve, const bool use_island_connect,
-        const int boolean_mode,
+        const bool use_edge_tag, const int boolean_mode,
         const float eps);
 
 enum {
diff --git a/source/blender/bmesh/tools/bmesh_separate.c b/source/blender/bmesh/tools/bmesh_separate.c
new file mode 100644
index 00000000000..ff28ed4c156
--- /dev/null
+++ b/source/blender/bmesh/tools/bmesh_separate.c
@@ -0,0 +1,133 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/** \file blender/bmesh/tools/bmesh_separate.c
+ *  \ingroup bmesh
+ *
+ * BMesh separate, disconnects a set of faces from all others,
+ * so they don't share any vertices/edges with other faces.
+ */
+
+#include <limits.h>
+
+#include "MEM_guardedalloc.h"
+
+#include "BLI_utildefines.h"
+#include "BLI_buffer.h"
+
+#include "bmesh.h"
+#include "intern/bmesh_private.h"
+#include "bmesh_separate.h"  /* own include */
+
+/**
+ * Split off all faces that match `filter_fn`, so they share no vertices with non-matching faces.
+ * \note Overwrites the #BM_ELEM_TAG flag on the vertices of `bm`.
+ */
+void BM_mesh_separate_faces(
+        BMesh *bm,
+        BMFaceFilterFunc filter_fn, void *user_data)
+{
+	BMFace **faces_array_all = MEM_mallocN(bm->totface * sizeof(BMFace *), __func__);
+	/*
+	 * - Create an array of faces based on 'filter_fn'.
+	 *   Matching faces fill the array from the front, non-matching faces from the back.
+	 *
+	 * - Enable all vertex tags, then disable the tag on every vertex used by 'faces_b'.
+	 *
+	 * - Loop over 'faces_a', checking each vertex,
+	 *   splitting out any which aren't tagged (and therefore shared), enabling tags as we go.
+	 */
+
+	BMFace *f;
+	BMIter iter;
+
+	unsigned int faces_a_len = 0;
+	unsigned int faces_b_len = 0;
+	{
+		int i_a = 0;
+		int i_b = bm->totface;
+		BM_ITER_MESH (f, &iter, bm, BM_FACES_OF_MESH) {
+			faces_array_all[filter_fn(f, user_data) ? i_a++ : --i_b] = f;
+		}
+		faces_a_len = i_a;
+		faces_b_len = bm->totface - i_a;
+	}
+
+	BMFace **faces_a = faces_array_all;
+	BMFace **faces_b = faces_array_all + faces_a_len;
+
+	/* Enable the tag on all vertices. */
+	BM_mesh_elem_hflag_enable_all(bm, BM_VERT, BM_ELEM_TAG, false);
+
+	/* Disable the tag on every vertex used by a non-matching face ('faces_b'). */
+	for (unsigned int i = 0; i < faces_b_len; i++) {
+		BMLoop *l_iter, *l_first;
+		l_iter = l_first = BM_FACE_FIRST_LOOP(faces_b[i]);
+		do {
+			BM_elem_flag_disable(l_iter->v, BM_ELEM_TAG);
+		} while ((l_iter = l_iter->next) != l_first);
+	}
+
+
+	BLI_buffer_declare_static(BMLoop **, loop_split, 0, 128);
+
+	/* Check shared verts (test the tag on 'faces_a' verts, enabling it once handled). */
+	for (unsigned int i = 0; i < faces_a_len; i++) {
+		BMLoop *l_iter, *l_first;
+		l_iter = l_first = BM_FACE_FIRST_LOOP(faces_a[i]);
+		do {
+			if (!BM_elem_flag_test(l_iter->v, BM_ELEM_TAG)) {
+				BMVert *v = l_iter->v;
+				/* Enable, since we may visit this vertex again on other faces */
+				BM_elem_flag_enable(v, BM_ELEM_TAG);
+
+				/* We know the vertex is shared, collect the loops of matching faces using it and split them off. */
+
+				/* Fill 'loop_split' by walking the edges around 'v' and each edge's radial loops. */
+				{
+					BMEdge *e_first, *e_iter;
+					e_iter = e_first = l_iter->e;
+					do {
+						if (e_iter->l != NULL) {
+							BMLoop *l_radial_first, *l_radial_iter;
+							l_radial_first = l_radial_iter = e_iter->l;
+							do {
+								if (l_radial_iter->v == v) {
+									if (filter_fn(l_radial_iter->f, user_data)) {
+										BLI_buffer_append(&loop_split, BMLoop *, l_radial_iter);
+									}
+								}
+							} while ((l_radial_iter = l_radial_iter->radial_next) != l_radial_first);
+						}
+					} while ((e_iter = bmesh_disk_edge_next(e_iter, v)) != e_first);
+				}
+
+				/* Perform the split */
+				BM_face_loop_separate_multi(bm, loop_split.data, loop_split.count);
+
+				BLI_buffer_empty(&loop_split);
+			}
+		} while ((l_iter = l_iter->next) != l_first);
+	}
+
+	BLI_buffer_free(&loop_split);
+
+	MEM_freeN(faces_array_all);
+}
diff --git a/source/blender/bmesh/tools/bmesh_separate.h b/source/blender/bmesh/tools/bmesh_separate.h
new file mode 100644
index 00000000000..91b2b71c872
--- /dev/null
+++ b/source/blender/bmesh/tools/bmesh_separate.h
@@ -0,0 +1,32 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+#ifndef __BMESH_SEPARATE_H__
+#define __BMESH_SEPARATE_H__
+
+/** \file blender/bmesh/tools/bmesh_separate.h
+ *  \ingroup bmesh
+ */
+
+void BM_mesh_separate_faces(
+        BMesh *bm,
+        BMFaceFilterFunc filter_fn, void *user_data);
+
+#endif /* __BMESH_SEPARATE_H__ */
diff --git a/source/blender/collada/AnimationImporter.cpp b/source/blender/collada/AnimationImporter.cpp
index 3801c9300df..bd47ee0214d 100644
--- a/source/blender/collada/AnimationImporter.cpp
+++ b/source/blender/collada/AnimationImporter.cpp
@@ -1934,7 +1934,7 @@ Object *AnimationImporter::get_joint_object(COLLADAFW::Node *root, COLLADAFW::No
 			mul_m4_m4m4(mat, ipar, temp);
 		}
 
-		TransformBase::decompose(mat, job->loc, NULL, job->quat, job->size);
+		bc_decompose(mat, job->loc, NULL, job->quat, job->size);
 
 		if (par_job) {
 			job->parent = par_job;
diff --git a/source/blender/collada/ArmatureExporter.cpp b/source/blender/collada/ArmatureExporter.cpp
index 9c26ba83b44..49722873a91 100644
--- a/source/blender/collada/ArmatureExporter.cpp
+++ b/source/blender/collada/ArmatureExporter.cpp
@@ -156,11 +156,6 @@ void ArmatureExporter::find_objects_using_armature(Object *ob_arm, std::vector<O
 }
 #endif
 
-std::string ArmatureExporter::get_joint_sid(Bone *bone, Object *ob_arm)
-{
-	return get_joint_id(bone, ob_arm);
-}
-
 // parent_mat is armature-space
 void ArmatureExporter::add_bone_node(Bone *bone, Object *ob_arm, Scene *sce,
                                      SceneExporter *se,
@@ -261,47 +256,57 @@ void ArmatureExporter::add_bone_transform(Object *ob_arm, Bone *bone, COLLADASW:
 	//bPoseChannel *pchan = BKE_pose_channel_find_name(ob_arm->pose, bone->name);
 
 	float mat[4][4];
+	float bone_rest_mat[4][4]; /* derived from bone->arm_mat */
+	float parent_rest_mat[4][4]; /* derived from bone->parent->arm_mat */
 
-	if (bone->parent) {
-		// get bone-space matrix from parent pose
-		/*bPoseChannel *parchan = BKE_pose_channel_find_name(ob_arm->pose, bone->parent->name);
-		float invpar[4][4];
-		invert_m4_m4(invpar, parchan->pose_mat);
-		mul_m4_m4m4(mat, invpar, pchan->pose_mat);*/
-		
-		float invpar[4][4];
-		invert_m4_m4(invpar, bone->parent->arm_mat);
-		mul_m4_m4m4(mat, invpar, bone->arm_mat);
+	bool has_restmat = bc_get_property_matrix(bone, "rest_mat", mat);
 
-	}
-	else {
+	if (!has_restmat) {
+
+		/* Have no restpose matrix stored, try old style <= Blender 2.78 */
 		
-		//copy_m4_m4(mat, pchan->pose_mat);
-		//pose mat is object space
-		//New change: export bone->arm_mat
-		copy_m4_m4(mat, bone->arm_mat);
-	}
+		bc_create_restpose_mat(this->export_settings, bone, bone_rest_mat, bone->arm_mat, true);
 
-	// OPEN_SIM_COMPATIBILITY
-	if (export_settings->open_sim) {
-		// Remove rotations vs armature from transform
-		// parent_rest_rot * mat * irest_rot
-		float temp[4][4];
-		copy_m4_m4(temp, bone->arm_mat);
-		temp[3][0] = temp[3][1] = temp[3][2] = 0.0f;
-		invert_m4(temp);
+		if (bone->parent) {
+			// get bone-space matrix from parent pose
+			/*bPoseChannel *parchan = BKE_pose_channel_find_name(ob_arm->pose, bone->parent->name);
+			float invpar[4][4];
+			invert_m4_m4(invpar, parchan->pose_mat);
+			mul_m4_m4m4(mat, invpar, pchan->pose_mat);*/
+			float invpar[4][4];
+			bc_create_restpose_mat(this->export_settings, bone->parent, parent_rest_mat, bone->parent->arm_mat, true);
 
-		mul_m4_m4m4(mat, mat, temp);
+			invert_m4_m4(invpar, parent_rest_mat);
+			mul_m4_m4m4(mat, invpar, bone_rest_mat);
 
-		if (bone->parent) {
-			copy_m4_m4(temp, bone->parent->arm_mat);
+		}
+		else {
+			copy_m4_m4(mat, bone_rest_mat);
+		}
+
+		// OPEN_SIM_COMPATIBILITY
+		if (export_settings->open_sim) {
+			// Remove rotations vs armature from transform
+			// parent_rest_rot * mat * irest_rot
+			float temp[4][4];
+			copy_m4_m4(temp, bone_rest_mat);
 			temp[3][0] = temp[3][1] = temp[3][2] = 0.0f;
+			invert_m4(temp);
+
+			mul_m4_m4m4(mat, mat, temp);
 
-			mul_m4_m4m4(mat, temp, mat);
+			if (bone->parent) {
+				copy_m4_m4(temp, parent_rest_mat);
+				temp[3][0] = temp[3][1] = temp[3][2] = 0.0f;
+
+				mul_m4_m4m4(mat, temp, mat);
+			}
 		}
 	}
 
+	bc_sanitize_mat(mat, 6); // XXX: Make this optional ?
 	TransformWriter::add_node_transform(node, mat, NULL);
+
 }
 
 std::string ArmatureExporter::get_controller_id(Object *ob_arm, Object *ob)
diff --git a/source/blender/collada/ArmatureExporter.h b/source/blender/collada/ArmatureExporter.h
index 883a6aca847..d271b505aa9 100644
--- a/source/blender/collada/ArmatureExporter.h
+++ b/source/blender/collada/ArmatureExporter.h
@@ -83,8 +83,6 @@ private:
 	void find_objects_using_armature(Object *ob_arm, std::vector<Object *>& objects, Scene *sce);
 #endif
 
-	std::string get_joint_sid(Bone *bone, Object *ob_arm);
-
 	// Scene, SceneExporter and the list of child_objects
 	// are required for writing bone parented objects
 	void add_bone_node(Bone *bone, Object *ob_arm, Scene *sce, SceneExporter *se,
diff --git a/source/blender/collada/ArmatureImporter.cpp b/source/blender/collada/ArmatureImporter.cpp
index 17334ca326c..0ea8324ed7c 100644
--- a/source/blender/collada/ArmatureImporter.cpp
+++ b/source/blender/collada/ArmatureImporter.cpp
@@ -32,6 +32,7 @@
 
 #include "COLLADAFWUniqueId.h"
 
+extern "C" {
 #include "BKE_action.h"
 #include "BKE_depsgraph.h"
 #include "BKE_object.h"
@@ -39,7 +40,9 @@
 #include "BLI_string.h"
 #include "BLI_listbase.h"
 #include "ED_armature.h"
+}
 
+#include "collada_utils.h"
 #include "ArmatureImporter.h"
 
 // use node name, or fall back to original id if not present (name is optional)
@@ -91,6 +94,7 @@ int ArmatureImporter::create_bone(SkinInfo *skin, COLLADAFW::Node *node, EditBon
 {
 	float mat[4][4];
 	float joint_inv_bind_mat[4][4];
+	float joint_bind_mat[4][4];
 	int chain_length = 0;
 
 	//Checking if bone is already made.
@@ -106,7 +110,7 @@ int ArmatureImporter::create_bone(SkinInfo *skin, COLLADAFW::Node *node, EditBon
 	*/
 
 	std::map<COLLADAFW::UniqueId, SkinInfo>::iterator skin_it;
-	bool bone_is_not_skinned = true;
+	bool bone_is_skinned = false;
 	for (skin_it = skin_by_data_uid.begin(); skin_it != skin_by_data_uid.end(); skin_it++) {
 
 		SkinInfo *b = &skin_it->second;
@@ -114,7 +118,7 @@ int ArmatureImporter::create_bone(SkinInfo *skin, COLLADAFW::Node *node, EditBon
 
 			// get original world-space matrix
 			invert_m4_m4(mat, joint_inv_bind_mat);
-
+			copy_m4_m4(joint_bind_mat, mat);
 			// And make local to armature
 			Object *ob_arm = skin->BKE_armature_from_object();
 			if (ob_arm) {
@@ -123,24 +127,14 @@ int ArmatureImporter::create_bone(SkinInfo *skin, COLLADAFW::Node *node, EditBon
 				mul_m4_m4m4(mat, invmat, mat);
 			}
 
-			bone_is_not_skinned = false;
+			bone_is_skinned = true;
 			break;
 		}
 	}
 
 	// create a bone even if there's no joint data for it (i.e. it has no influence)
-	if (bone_is_not_skinned) {
-		float obmat[4][4];
-		// bone-space
-		get_node_mat(obmat, node, NULL, NULL);
-
-		// get world-space
-		if (parent) {
-			mul_m4_m4m4(mat, parent_mat, obmat);
-		}
-		else {
-			copy_m4_m4(mat, obmat);
-		}
+	if (!bone_is_skinned) {
+		get_node_mat(mat, node, NULL, NULL, parent_mat);
 	}
 
 	if (parent) bone->parent = parent;
@@ -156,10 +150,11 @@ int ArmatureImporter::create_bone(SkinInfo *skin, COLLADAFW::Node *node, EditBon
 	int use_connect = be.get_use_connect();
 
 	switch (use_connect) {
-	case 1:  bone->flag |= BONE_CONNECTED;
-			 break;
-	case 0:  bone->flag &= ~BONE_CONNECTED;
-	case -1: break; // not defined
+		case 1: bone->flag |= BONE_CONNECTED;
+			break;
+		case -1:/* Connect type not specified */
+		case 0: bone->flag &= ~BONE_CONNECTED;
+			break;
 	}
 
 	if (be.has_roll()) {
@@ -172,6 +167,15 @@ int ArmatureImporter::create_bone(SkinInfo *skin, COLLADAFW::Node *node, EditBon
 		bone->roll = angle;
 	}
 	copy_v3_v3(bone->head, mat[3]);
+
+	if (bone_is_skinned)
+	{
+		float rest_mat[4][4];
+		get_node_mat(rest_mat, node, NULL, NULL, NULL);
+		bc_set_IDPropertyMatrix(bone, "bind_mat", joint_bind_mat);
+		bc_set_IDPropertyMatrix(bone, "rest_mat", rest_mat);
+	}
+
 	add_v3_v3v3(bone->tail, bone->head, tail); //tail must be non zero
 
 	/* find smallest bone length in armature (used later for leaf bone length) */
@@ -273,7 +277,6 @@ void ArmatureImporter::fix_parent_connect(bArmature *armature, Bone *bone)
 
 }
 
-
 void ArmatureImporter::connect_bone_chains(bArmature *armature, Bone *parentbone, int clip)
 {
 	BoneExtensionMap &extended_bones = bone_extension_manager.getExtensionMap(armature);
@@ -289,12 +292,13 @@ void ArmatureImporter::connect_bone_chains(bArmature *armature, Bone *parentbone
 		for (; child; child = child->next) {
 			BoneExtended *be = extended_bones[child->name];
 			if (be != NULL) {
-				if (be->get_chain_length() <= clip) {
-					if (be->get_chain_length() > maxlen) {
+				int chain_len = be->get_chain_length();
+				if (chain_len <= clip) {
+					if (chain_len > maxlen) {
 						dominant_child = be;
-						maxlen = be->get_chain_length();
+						maxlen = chain_len;
 					}
-					else if (be->get_chain_length() == maxlen) {
+					else if (chain_len == maxlen) {
 						dominant_child = NULL;
 					}
 				}
@@ -308,7 +312,6 @@ void ArmatureImporter::connect_bone_chains(bArmature *armature, Bone *parentbone
 		EditBone *pebone = bc_get_edit_bone(armature, parentbone->name);
 		EditBone *cebone = bc_get_edit_bone(armature, dominant_child->get_name());
 		if (pebone && !(cebone->flag & BONE_CONNECTED)) {
-
 			float vec[3];
 			sub_v3_v3v3(vec, cebone->head, pebone->head);
 
@@ -321,14 +324,16 @@ void ArmatureImporter::connect_bone_chains(bArmature *armature, Bone *parentbone
 
 			if (len_squared_v3(vec) > MINIMUM_BONE_LENGTH)
 			{
-				pebone->tail[0] = cebone->head[0];
-				pebone->tail[1] = cebone->head[1];
-				pebone->tail[2] = cebone->head[2];
-
+				copy_v3_v3(pebone->tail, cebone->head);
+				if (pbe) { pbe->set_tail(pebone->tail); } /* pbe may be NULL (see check below); keeps fix_leafbone happy */
 				if (pbe && pbe->get_chain_length() >= this->import_settings->min_chain_length) {
+					/* map lookups can yield NULL (see the be != NULL check above), so guard cbe as well */
+					BoneExtended *cbe = extended_bones[cebone->name];
+					if (cbe) { cbe->set_use_connect(true); }
+
 					cebone->flag |= BONE_CONNECTED;
-					printf("Connecting chain: parent %s --> %s (child)\n", pebone->name, cebone->name);
 					pbe->set_leaf_bone(false);
+					printf("Connect Bone chain: parent (%s --> %s) child)\n", pebone->name, cebone->name);
 				}
 			}
 		}
diff --git a/source/blender/collada/ControllerExporter.cpp b/source/blender/collada/ControllerExporter.cpp
index 06e151c363b..5444c9dfa6b 100644
--- a/source/blender/collada/ControllerExporter.cpp
+++ b/source/blender/collada/ControllerExporter.cpp
@@ -157,11 +157,6 @@ void ArmatureExporter::find_objects_using_armature(Object *ob_arm, std::vector<O
 }
 #endif
 
-std::string ControllerExporter::get_joint_sid(Bone *bone, Object *ob_arm)
-{
-	return get_joint_id(bone, ob_arm);
-}
-
 std::string ControllerExporter::get_controller_id(Object *ob_arm, Object *ob)
 {
 	return translate_id(id_name(ob_arm)) + "_" + translate_id(id_name(ob)) + SKIN_CONTROLLER_ID_SUFFIX;
@@ -468,81 +463,6 @@ std::string ControllerExporter::add_joints_source(Object *ob_arm, ListBase *defb
 	return source_id;
 }
 
-static float get_property(Bone *bone, const char *key, float def)
-{
-	float result = def;
-	if (bone->prop) {
-		IDProperty *property = IDP_GetPropertyFromGroup(bone->prop, key);
-		if (property) {
-			switch (property->type) {
-				case IDP_INT:
-					result = (float)(IDP_Int(property));
-					break;
-				case IDP_FLOAT:
-					result = (float)(IDP_Float(property));
-					break;
-				case IDP_DOUBLE:
-					result = (float)(IDP_Double(property));
-					break;
-				default:
-					result = def;
-			}
-		}
-	}
-	return result;
-}
-
-/**
- * This function creates an arbitrary rest pose matrix from
- * data provided as custom properties. This is a workaround
- * for support of maya's restpose matrix which can be arbitrary
- * in opposition to Blender where the Rest pose Matrix is always
- * the Identity matrix.
- *
- * The custom properties are:
- *
- * restpose_scale_x
- * restpose_scale_y
- * restpose_scale_z
- *
- * restpose_rot_x
- * restpose_rot_y
- * restpose_rot_z
- *
- * restpose_loc_x
- * restpose_loc_y
- * restpose_loc_z
- *
- * The matrix is only setup if the scale AND the rot properties are defined.
- * The presence of the loc properties is optional.
- *
- * This feature has been implemented to support Second Life "Fitted Mesh"
- * TODO: Check if an arbitrary rest pose matrix makes sense within Blender.
- * Eventually leverage the custom property data into an "official" 
- * Edit_bone Property
- */
-static void create_restpose_mat(Bone *bone, float mat[4][4])
-{
-	float loc[3] = {
-		get_property(bone, "restpose_loc_x", 0.0),
-		get_property(bone, "restpose_loc_y", 0.0),
-		get_property(bone, "restpose_loc_z", 0.0)
-	};
-
-	float rot[3] = {
-		DEG2RADF(get_property(bone, "restpose_rot_x", 0.0)),
-		DEG2RADF(get_property(bone, "restpose_rot_y", 0.0)),
-		DEG2RADF(get_property(bone, "restpose_rot_z", 0.0))
-	};
-
-	float scale[3] = {
-		get_property(bone, "restpose_scale_x", 1.0),
-		get_property(bone, "restpose_scale_y", 1.0),
-		get_property(bone, "restpose_scale_z", 1.0)
-	};
-
-	loc_eulO_size_to_mat4(mat, loc, rot, scale, 6);
-}
 
 std::string ControllerExporter::add_inv_bind_mats_source(Object *ob_arm, ListBase *defbase, const std::string& controller_id)
 {
@@ -585,30 +505,35 @@ std::string ControllerExporter::add_inv_bind_mats_source(Object *ob_arm, ListBas
 			float world[4][4];
 			float inv_bind_mat[4][4];
 
+			float bind_mat[4][4]; /* derived from bone->arm_mat */
+
+			bool has_bindmat = bc_get_property_matrix(pchan->bone, "bind_mat", bind_mat);
 			
-			// SL/OPEN_SIM COMPATIBILITY
-			if (export_settings->open_sim) {
-				// Only translations, no rotation vs armature
-				float temp[4][4];
-				unit_m4(temp);
-				copy_v3_v3(temp[3], pchan->bone->arm_mat[3]);
-				mul_m4_m4m4(world, ob_arm->obmat, temp);
-
-				// Add Maya restpose matrix (if defined as properties)
-				float restpose_mat[4][4];
-				create_restpose_mat(pchan->bone, restpose_mat);
-				mul_m4_m4m4(world, world, restpose_mat);
+			if (!has_bindmat) {
 
-			}
-			else {
-				// make world-space matrix, arm_mat is armature-space
-				mul_m4_m4m4(world, ob_arm->obmat, pchan->bone->arm_mat);
+				/* Have no bind matrix stored, try old style <= Blender 2.78 */
+
+				bc_create_restpose_mat(this->export_settings, pchan->bone, bind_mat, pchan->bone->arm_mat, true);
+
+				// SL/OPEN_SIM COMPATIBILITY
+				if (export_settings->open_sim) {
+
+					float loc[3];
+					float rot[3] = { 0, 0, 0 };
+					float scale[3];
+					bc_decompose(bind_mat, loc, NULL, NULL, scale);
+
+					// Only translations, no rotation vs armature
+					loc_eulO_size_to_mat4(bind_mat, loc, rot, scale, 6);
+				}
 			}
 
+			// make world-space matrix (bind_mat is armature-space)
+			mul_m4_m4m4(world, ob_arm->obmat, bind_mat);
 
 			invert_m4_m4(mat, world);
 			converter.mat4_to_dae(inv_bind_mat, mat);
-
+			bc_sanitize_mat(inv_bind_mat, 6); // XXX: Make this optional ?
 			source.appendValues(inv_bind_mat);
 		}
 	}
diff --git a/source/blender/collada/ControllerExporter.h b/source/blender/collada/ControllerExporter.h
index 0be51187f6f..80b858ca6dd 100644
--- a/source/blender/collada/ControllerExporter.h
+++ b/source/blender/collada/ControllerExporter.h
@@ -84,8 +84,6 @@ private:
 	void find_objects_using_armature(Object *ob_arm, std::vector<Object *>& objects, Scene *sce);
 #endif
 
-	std::string get_joint_sid(Bone *bone, Object *ob_arm);
-
 	std::string get_controller_id(Object *ob_arm, Object *ob);
 
 	std::string get_controller_id(Key *key, Object *ob);
diff --git a/source/blender/collada/ExportSettings.h b/source/blender/collada/ExportSettings.h
index 9451cac9dae..03e380dc198 100644
--- a/source/blender/collada/ExportSettings.h
+++ b/source/blender/collada/ExportSettings.h
@@ -51,7 +51,9 @@ public:
 	bool use_blender_profile;
 	bool sort_by_name;
 	BC_export_transformation_type export_transformation_type;
+
 	bool open_sim;
+	bool keep_bind_info;
 
 	char *filepath;
 	LinkNode *export_set;
diff --git a/source/blender/collada/ImportSettings.h b/source/blender/collada/ImportSettings.h
index 2c52d73e756..4a2d4e8046a 100644
--- a/source/blender/collada/ImportSettings.h
+++ b/source/blender/collada/ImportSettings.h
@@ -37,6 +37,7 @@ public:
 	bool fix_orientation;
 	int  min_chain_length;
 	char *filepath;
+	bool keep_bind_info;
 };
 
 #endif
diff --git a/source/blender/collada/SkinInfo.cpp b/source/blender/collada/SkinInfo.cpp
index 7242a24523c..71875d6274a 100644
--- a/source/blender/collada/SkinInfo.cpp
+++ b/source/blender/collada/SkinInfo.cpp
@@ -230,7 +230,6 @@ void SkinInfo::link_armature(bContext *C, Object *ob, std::map<COLLADAFW::Unique
 	ModifierData *md = ED_object_modifier_add(NULL, bmain, scene, ob, NULL, eModifierType_Armature);
 	ArmatureModifierData *amd = (ArmatureModifierData *)md;
 	amd->object = ob_arm;
-	struct bArmature *armature = (bArmature *)ob_arm->data;
 
 #if 1
 	bc_set_parent(ob, ob_arm, C);
diff --git a/source/blender/collada/TransformReader.cpp b/source/blender/collada/TransformReader.cpp
index f8f31304d28..7f742be7e30 100644
--- a/source/blender/collada/TransformReader.cpp
+++ b/source/blender/collada/TransformReader.cpp
@@ -34,7 +34,21 @@ TransformReader::TransformReader(UnitConverter *conv) : unit_converter(conv)
 	/* pass */
 }
 
-void TransformReader::get_node_mat(float mat[4][4], COLLADAFW::Node *node, std::map<COLLADAFW::UniqueId, Animation> *animation_map, Object *ob)
+void TransformReader::get_node_mat(
+	float mat[4][4],
+	COLLADAFW::Node *node,
+	std::map<COLLADAFW::UniqueId, Animation> *animation_map,
+	Object *ob)
+{
+	get_node_mat(mat, node, animation_map, ob, NULL);
+}
+
+void TransformReader::get_node_mat(
+	float mat[4][4],
+	COLLADAFW::Node *node,
+	std::map<COLLADAFW::UniqueId, Animation> *animation_map,
+	Object *ob,
+	float parent_mat[4][4])
 {
 	float cur[4][4];
 	float copy[4][4];
@@ -52,6 +66,9 @@ void TransformReader::get_node_mat(float mat[4][4], COLLADAFW::Node *node, std::
 				// then this is considered as redundant information.
 				// So if we find a Matrix we use that and return.
 				dae_matrix_to_mat4(tm, mat);
+				if (parent_mat) {
+					mul_m4_m4m4(mat, parent_mat, mat);
+				}
 				return;
 			case COLLADAFW::Transformation::TRANSLATE:
 				dae_translate_to_mat4(tm, cur);
@@ -80,6 +97,10 @@ void TransformReader::get_node_mat(float mat[4][4], COLLADAFW::Node *node, std::
 			(*animation_map)[anim_list_id] = anim;
 		}
 	}
+
+	if (parent_mat) {
+		mul_m4_m4m4(mat, parent_mat, mat);
+	}
 }
 
 void TransformReader::dae_rotate_to_mat4(COLLADAFW::Transformation *tm, float m[4][4])
diff --git a/source/blender/collada/TransformReader.h b/source/blender/collada/TransformReader.h
index ab974b9ba85..08bb17ccac1 100644
--- a/source/blender/collada/TransformReader.h
+++ b/source/blender/collada/TransformReader.h
@@ -43,7 +43,7 @@
 
 //struct Object;
 
-class TransformReader : public TransformBase
+class TransformReader
 {
 protected:
 
@@ -59,7 +59,8 @@ public:
 	TransformReader(UnitConverter *conv);
 
 	void get_node_mat(float mat[4][4], COLLADAFW::Node *node, std::map<COLLADAFW::UniqueId, Animation> *animation_map, Object *ob);
-	
+	void get_node_mat(float mat[4][4], COLLADAFW::Node *node, std::map<COLLADAFW::UniqueId, Animation> *animation_map, Object *ob, float parent_mat[4][4]);
+
 	void dae_rotate_to_mat4(COLLADAFW::Transformation *tm, float m[4][4]);
 	void dae_translate_to_mat4(COLLADAFW::Transformation *tm, float m[4][4]);
 	void dae_scale_to_mat4(COLLADAFW::Transformation *tm, float m[4][4]);
diff --git a/source/blender/collada/TransformWriter.cpp b/source/blender/collada/TransformWriter.cpp
index 908111ebae6..b7eeff3b074 100644
--- a/source/blender/collada/TransformWriter.cpp
+++ b/source/blender/collada/TransformWriter.cpp
@@ -27,11 +27,10 @@
 
 
 #include "BKE_object.h"
+#include "BLI_math.h"
 
 #include "TransformWriter.h"
 
-#include "BLI_math.h"
-
 void TransformWriter::add_node_transform(COLLADASW::Node& node, float mat[4][4], float parent_mat[4][4])
 {
 	float loc[3], rot[3], scale[3];
@@ -51,7 +50,7 @@ void TransformWriter::add_node_transform(COLLADASW::Node& node, float mat[4][4],
 	converter->mat4_to_dae_double(dmat, local);
 	delete converter;
 
-	TransformBase::decompose(local, loc, rot, NULL, scale);
+	bc_decompose(local, loc, rot, NULL, scale);
 
 	if (node.getType() == COLLADASW::Node::JOINT) {
 		// XXX Why are joints handled differently ?
@@ -116,7 +115,7 @@ void TransformWriter::add_node_transform_ob(COLLADASW::Node& node, Object *ob, B
 		case BC_TRANSFORMATION_TYPE_TRANSROTLOC:
 		{
 			float loc[3], rot[3], scale[3];
-			TransformBase::decompose(f_obmat, loc, rot, NULL, scale);
+			bc_decompose(f_obmat, loc, rot, NULL, scale);
 			add_transform(node, loc, rot, scale); 
 			break;
 		}
diff --git a/source/blender/collada/TransformWriter.h b/source/blender/collada/TransformWriter.h
index 7f69a4b9c95..5bb13d4aac9 100644
--- a/source/blender/collada/TransformWriter.h
+++ b/source/blender/collada/TransformWriter.h
@@ -33,9 +33,10 @@
 #include "DNA_object_types.h"
 
 #include "collada_internal.h"
+#include "collada_utils.h"
 #include "collada.h"
 
-class TransformWriter : protected TransformBase
+class TransformWriter
 {
 protected:
 	void add_node_transform(COLLADASW::Node& node, float mat[4][4], float parent_mat[4][4]);
diff --git a/source/blender/collada/collada.cpp b/source/blender/collada/collada.cpp
index fe8b1d2320a..c0784ca8eb5 100644
--- a/source/blender/collada/collada.cpp
+++ b/source/blender/collada/collada.cpp
@@ -48,7 +48,8 @@ int collada_import(bContext *C,
 				   int find_chains,
 				   int auto_connect,
 				   int fix_orientation,
-				   int min_chain_length)
+				   int min_chain_length,
+				   int keep_bind_info)
 {
 
 	ImportSettings import_settings;
@@ -58,6 +59,7 @@ int collada_import(bContext *C,
 	import_settings.find_chains      = find_chains != 0;
 	import_settings.fix_orientation  = fix_orientation != 0;
 	import_settings.min_chain_length = min_chain_length;
+	import_settings.keep_bind_info = keep_bind_info;
 
 	DocumentImporter imp(C, &import_settings);
 	if (imp.import()) return 1;
@@ -87,7 +89,9 @@ int collada_export(Scene *sce,
 				   int use_blender_profile,
 				   int sort_by_name,
 				   BC_export_transformation_type export_transformation_type,
-                   int open_sim)
+				   int open_sim,
+
+				   int keep_bind_info)
 {
 	ExportSettings export_settings;
 
@@ -113,6 +117,7 @@ int collada_export(Scene *sce,
 	export_settings.export_transformation_type = export_transformation_type;
 	export_settings.open_sim                   = open_sim != 0;
 
+	export_settings.keep_bind_info = keep_bind_info;
 
 	int includeFilter = OB_REL_NONE;
 	if (export_settings.include_armatures) includeFilter |= OB_REL_MOD_ARMATURE;
diff --git a/source/blender/collada/collada.h b/source/blender/collada/collada.h
index a4416608584..09acc4a064b 100644
--- a/source/blender/collada/collada.h
+++ b/source/blender/collada/collada.h
@@ -58,7 +58,9 @@ int collada_import(struct bContext *C,
 				   int find_chains,
 				   int auto_connect,
 				   int fix_orientation,
-				   int min_chain_length);
+				   int min_chain_length,
+
+				   int keep_bind_info);
 
 int collada_export(struct Scene *sce,
                    const char *filepath,
@@ -81,9 +83,9 @@ int collada_export(struct Scene *sce,
                    int use_blender_profile,
                    int sort_by_name,
                    BC_export_transformation_type export_transformation_type,
-                   int open_sim);
-
 
+				   int open_sim,
+				   int keep_bind_info);
 
 #ifdef __cplusplus
 }
diff --git a/source/blender/collada/collada_internal.cpp b/source/blender/collada/collada_internal.cpp
index e1a13559b08..6ebde6bd773 100644
--- a/source/blender/collada/collada_internal.cpp
+++ b/source/blender/collada/collada_internal.cpp
@@ -162,18 +162,6 @@ void UnitConverter::calculate_scale(Scene &sce)
 	size_to_mat4(scale_mat4, rescale);
 }
 
-void TransformBase::decompose(float mat[4][4], float *loc, float eul[3], float quat[4], float *size)
-{
-	mat4_to_size(size, mat);
-	if (eul) {
-		mat4_to_eul(eul, mat);
-	}
-	if (quat) {
-		mat4_to_quat(quat, mat);
-	}
-	copy_v3_v3(loc, mat[3]);
-}
-
 /**
  * Translation map.
  * Used to translate every COLLADA id to a valid id, no matter what "wrong" letters may be
@@ -341,7 +329,12 @@ std::string get_light_id(Object *ob)
 
 std::string get_joint_id(Bone *bone, Object *ob_arm)
 {
-	return translate_id(/*id_name(ob_arm) + "_" +*/ bone->name);
+	return translate_id(id_name(ob_arm) + "_" + bone->name);
+}
+
+std::string get_joint_sid(Bone *bone, Object *ob_arm)
+{
+	return translate_id(bone->name);
 }
 
 std::string get_camera_id(Object *ob)
diff --git a/source/blender/collada/collada_internal.h b/source/blender/collada/collada_internal.h
index 4aa637a6876..1c7aa160f57 100644
--- a/source/blender/collada/collada_internal.h
+++ b/source/blender/collada/collada_internal.h
@@ -85,12 +85,6 @@ public:
 
 };
 
-class TransformBase
-{
-public:
-	void decompose(float mat[4][4], float *loc, float eul[3], float quat[4], float *size);
-};
-
 extern void clear_global_id_map();
 /** Look at documentation of translate_map */
 extern std::string translate_id(const std::string &id);
@@ -104,6 +98,7 @@ extern std::string get_geometry_id(Object *ob, bool use_instantiation);
 extern std::string get_light_id(Object *ob);
 
 extern std::string get_joint_id(Bone *bone, Object *ob_arm);
+extern std::string get_joint_sid(Bone *bone, Object *ob_arm);
 
 extern std::string get_camera_id(Object *ob);
 
diff --git a/source/blender/collada/collada_utils.cpp b/source/blender/collada/collada_utils.cpp
index 2efa8b21d81..ac4395e1430 100644
--- a/source/blender/collada/collada_utils.cpp
+++ b/source/blender/collada/collada_utils.cpp
@@ -32,8 +32,6 @@
 #include "COLLADAFWMeshPrimitive.h"
 #include "COLLADAFWMeshVertexData.h"
 
-#include "collada_utils.h"
-
 extern "C" {
 #include "DNA_modifier_types.h"
 #include "DNA_customdata_types.h"
@@ -63,6 +61,9 @@ extern "C" {
 #include "bmesh_tools.h"
 }
 
+#include "collada_utils.h"
+#include "ExportSettings.h"
+
 float bc_get_float_value(const COLLADAFW::FloatOrDoubleArray& array, unsigned int index)
 {
 	if (index >= array.getValuesCount())
@@ -352,6 +353,28 @@ void bc_match_scale(std::vector<Object *> *objects_done,
 	}
 }
 
+/*
+    Get only the needed components of a matrix: any of loc, eul, quat, size may be NULL to skip it
+*/
+void bc_decompose(float mat[4][4], float *loc, float eul[3], float quat[4], float *size)
+{
+	if (size) {
+		mat4_to_size(size, mat);
+	}
+
+	if (eul) {
+		mat4_to_eul(eul, mat);
+	}
+
+	if (quat) {
+		mat4_to_quat(quat, mat);
+	}
+
+	if (loc) {
+		copy_v3_v3(loc, mat[3]);
+	}
+}
+
 void bc_triangulate_mesh(Mesh *me)
 {
 	bool use_beauty  = false;
@@ -612,3 +635,200 @@ int BoneExtended::get_use_connect()
 {
 	return this->use_connect;
 }
+
+/**
+* Store a 4*4 matrix (row by row, 16 floats) as custom property `key` on ebone,
+* creating the ID property group on demand and replacing any previous value. */
+void bc_set_IDPropertyMatrix(EditBone *ebone, const char *key, float mat[4][4])
+{
+	IDProperty *idgroup = (IDProperty *)ebone->prop;
+	if (idgroup == NULL)
+	{
+		IDPropertyTemplate val = { 0 };
+		idgroup = IDP_New(IDP_GROUP, &val, "RNA_EditBone ID properties");
+		ebone->prop = idgroup;
+	}
+	/* Describe the new property: a float array with 16 entries */
+	IDPropertyTemplate val = { 0 };
+	val.array.len = 16;
+	val.array.type = IDP_FLOAT;
+
+	IDProperty *data = IDP_New(IDP_ARRAY, &val, key);
+	float *array = (float *)IDP_Array(data);
+	for (int i = 0; i < 4; i++)
+		for (int j = 0; j < 4; j++)
+			array[4 * i + j] = mat[i][j];
+	/* IDP_AddToGroup() rejects an existing name (leaking `data`); replace instead */
+	IDP_ReplaceInGroup(idgroup, data);
+}
+
+#if 0
+/**
+* Stores a Float value as a custom bone property
+*
+* Note: This function is currently not needed. Keep for future usage
+*/
+static void bc_set_IDProperty(EditBone *ebone, const char *key, float value)
+{
+	if (ebone->prop == NULL)
+	{
+		IDPropertyTemplate val = { 0 };
+		ebone->prop = IDP_New(IDP_GROUP, &val, "RNA_EditBone ID properties");
+	}
+
+	IDProperty *pgroup = (IDProperty *)ebone->prop;
+	IDPropertyTemplate val = { 0 };
+	IDProperty *prop = IDP_New(IDP_FLOAT, &val, key);
+	IDP_Float(prop) = value;
+	IDP_AddToGroup(pgroup, prop);
+
+}
+#endif
+
+/*
+* Look up the custom property `key` in the bone's ID property group.
+* Returns NULL when the bone has no property group; callers also use it as an existence check.
+*/
+IDProperty *bc_get_IDProperty(Bone *bone, std::string key)
+{
+	return (bone->prop == NULL) ? NULL : IDP_GetPropertyFromGroup(bone->prop, key.c_str());
+}
+
+/**
+* Read a custom bone property (int, float or double) and convert it to float.
+* Return def if the property does not exist or has any other type.
+*/
+float bc_get_property(Bone *bone, std::string key, float def)
+{
+	float result = def;
+	IDProperty *property = bc_get_IDProperty(bone, key);
+	if (property) {
+		switch (property->type) {
+		case IDP_INT:
+			result = (float)(IDP_Int(property));
+			break;
+		case IDP_FLOAT:
+			result = (float)(IDP_Float(property));
+			break;
+		case IDP_DOUBLE:
+			result = (float)(IDP_Double(property));
+			break;
+		default:
+			result = def;
+		}
+	}
+	return result;
+}
+
+/**
+* Read a custom bone property and convert to matrix
+* Return true if conversion was successful
+*
+* Return false if:
+* - the property does not exist
+* - is not a float array of size 16 (the array data is read as float *)
+*/
+bool bc_get_property_matrix(Bone *bone, std::string key, float mat[4][4])
+{
+	IDProperty *property = bc_get_IDProperty(bone, key);
+	if (property && property->type == IDP_ARRAY && property->subtype == IDP_FLOAT && property->len == 16) {
+		float *array = (float *)IDP_Array(property);
+		for (int i = 0; i < 4; i++)
+			for (int j = 0; j < 4; j++)
+				mat[i][j] = array[4 * i + j];
+		return true;
+	}
+	return false;
+}
+
+/**
+* Read the three custom properties key_x, key_y, key_z into val (used in Blender <= 2.78);
+* each component falls back to the matching entry of def (see bc_get_property). */
+void bc_get_property_vector(Bone *bone, std::string key, float val[3], const float def[3])
+{
+	val[0] = bc_get_property(bone, key + "_x", def[0]);
+	val[1] = bc_get_property(bone, key + "_y", def[1]);
+	val[2] = bc_get_property(bone, key + "_z", def[2]);
+}
+
+/**
+* True if enabled and at least one of key_x, key_y, key_z exists as a custom property
+* (vector stored in 3 custom properties, used in Blender <= 2.78) */
+static bool has_custom_props(Bone *bone, bool enabled, std::string key)
+{
+	if (!enabled)
+		return false;
+
+	return (bc_get_IDProperty(bone, key + "_x")
+		||	bc_get_IDProperty(bone, key + "_y")
+		||	bc_get_IDProperty(bone, key + "_z"));
+
+}
+
+/**
+* Build to_mat from from_mat, overriding location/rotation/scale with the
+* restpose_loc/rot/scale custom bone properties when keep_bind_info is set;
+* with no such properties (or the option off) to_mat is a plain copy of from_mat.
+* Note: This is old style for Blender <= 2.78 only kept for compatibility
+*/
+void bc_create_restpose_mat(const ExportSettings *export_settings, Bone *bone, float to_mat[4][4], float from_mat[4][4], bool use_local_space)
+{
+	float loc[3];
+	float rot[3];
+	float scale[3];
+	static const float V0[3] = { 0, 0, 0 };
+
+	if (!has_custom_props(bone, export_settings->keep_bind_info, "restpose_loc") &&
+		!has_custom_props(bone, export_settings->keep_bind_info, "restpose_rot") &&
+		!has_custom_props(bone, export_settings->keep_bind_info, "restpose_scale"))
+	{
+		/* No need */
+		copy_m4_m4(to_mat, from_mat);
+		return;
+	}
+	/* Start from the components of from_mat; the properties below override them */
+	bc_decompose(from_mat, loc, rot, NULL, scale);
+	loc_eulO_size_to_mat4(to_mat, loc, rot, scale, 6);
+	/* Past the early return keep_bind_info is always set (has_custom_props requires it) */
+	if (export_settings->keep_bind_info) {
+		bc_get_property_vector(bone, "restpose_loc", loc, loc);
+		/* With use_local_space, add the restpose_loc of every ancestor bone (default 0) */
+		if (use_local_space && bone->parent) {
+			Bone *b = bone;
+			while (b->parent) {
+				b = b->parent;
+				float ploc[3];
+				bc_get_property_vector(b, "restpose_loc", ploc, V0);
+				loc[0] += ploc[0];
+				loc[1] += ploc[1];
+				loc[2] += ploc[2];
+			}
+		}
+	}
+	/* Rotation properties are converted with DEG2RADF; axes without a property keep from_mat's value */
+	if (export_settings->keep_bind_info) {
+		if (bc_get_IDProperty(bone, "restpose_rot_x"))
+		    rot[0] = DEG2RADF(bc_get_property(bone, "restpose_rot_x", 0));
+		if (bc_get_IDProperty(bone, "restpose_rot_y"))
+			rot[1] = DEG2RADF(bc_get_property(bone, "restpose_rot_y", 0));
+		if (bc_get_IDProperty(bone, "restpose_rot_z"))
+			rot[2] = DEG2RADF(bc_get_property(bone, "restpose_rot_z", 0));
+	}
+
+	if (export_settings->keep_bind_info) {
+		bc_get_property_vector(bone, "restpose_scale", scale, scale);
+	}
+	/* Rebuild to_mat from the (possibly overridden) components; replaces the matrix built above */
+	loc_eulO_size_to_mat4(to_mat, loc, rot, scale, 6);
+
+}
+
+/*
+    Make 4*4 matrices better readable: each entry becomes double_round(entry, precision)
+*/
+void bc_sanitize_mat(float mat[4][4], int precision)
+{
+	for (int i = 0; i < 4; i++)
+		for (int j = 0; j < 4; j++)
+			mat[i][j] = double_round(mat[i][j], precision);
+}
+\ No newline at end of file
diff --git a/source/blender/collada/collada_utils.h b/source/blender/collada/collada_utils.h
index 7fdbef3b6cb..38c0bd5096a 100644
--- a/source/blender/collada/collada_utils.h
+++ b/source/blender/collada/collada_utils.h
@@ -53,8 +53,10 @@ extern "C" {
 #include "BKE_object.h"
 #include "BKE_DerivedMesh.h"
 #include "BKE_scene.h"
+#include "BKE_idprop.h"
 }
 
+#include "ImportSettings.h"
 #include "ExportSettings.h"
 #include "collada_internal.h"
 
@@ -88,11 +90,24 @@ extern std::string bc_url_encode(std::string data);
 extern void bc_match_scale(Object *ob, UnitConverter &bc_unit, bool scale_to_scene);
 extern void bc_match_scale(std::vector<Object *> *objects_done, UnitConverter &unit_converter, bool scale_to_scene);
 
+extern void bc_decompose(float mat[4][4], float *loc, float eul[3], float quat[4], float *size);
+
 extern void bc_triangulate_mesh(Mesh *me);
 extern bool bc_is_leaf_bone(Bone *bone);
 extern EditBone *bc_get_edit_bone(bArmature * armature, char *name);
 extern int bc_set_layer(int bitfield, int layer, bool enable);
 extern int bc_set_layer(int bitfield, int layer);
+extern void bc_sanitize_mat(float mat[4][4], int precision);
+
+extern IDProperty *bc_get_IDProperty(Bone *bone, std::string key);
+extern void bc_set_IDProperty(EditBone *ebone, const char *key, float value);
+extern void bc_set_IDPropertyMatrix(EditBone *ebone, const char *key, float mat[4][4]);
+
+extern float bc_get_property(Bone *bone, std::string key, float def);
+extern void bc_get_property_vector(Bone *bone, std::string key, float val[3], const float def[3]);
+extern bool bc_get_property_matrix(Bone *bone, std::string key, float mat[4][4]);
+
+extern void bc_create_restpose_mat(const ExportSettings *export_settings, Bone *bone, float to_mat[4][4], float world[4][4], bool use_local_space);
 
 class BCPolygonNormalsIndices
 {
diff --git a/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp b/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp
index e1ada9a8c39..5f78067220a 100644
--- a/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp
+++ b/source/blender/compositor/operations/COM_ConvolutionEdgeFilterOperation.cpp
@@ -94,4 +94,10 @@ void ConvolutionEdgeFilterOperation::executePixel(float output[4], int x, int y,
 	output[2] = output[2] * value[0] + in2[2] * mval;
 	
 	output[3] = in2[3];
+
+	/* Make sure we don't return negative color. */
+	output[0] = max(output[0], 0.0f);
+	output[1] = max(output[1], 0.0f);
+	output[2] = max(output[2], 0.0f);
+	output[3] = max(output[3], 0.0f);
 }
diff --git a/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp b/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp
index 68ec2be5ebd..6ac1ff9a1eb 100644
--- a/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp
+++ b/source/blender/compositor/operations/COM_ConvolutionFilterOperation.cpp
@@ -107,6 +107,12 @@ void ConvolutionFilterOperation::executePixel(float output[4], int x, int y, voi
 	output[1] = output[1] * value[0] + in2[1] * mval;
 	output[2] = output[2] * value[0] + in2[2] * mval;
 	output[3] = output[3] * value[0] + in2[3] * mval;
+
+	/* Make sure we don't return negative color. */
+	output[0] = max(output[0], 0.0f);
+	output[1] = max(output[1], 0.0f);
+	output[2] = max(output[2], 0.0f);
+	output[3] = max(output[3], 0.0f);
 }
 
 bool ConvolutionFilterOperation::determineDependingAreaOfInterest(rcti *input, ReadBufferOperation *readOperation, rcti *output)
diff --git a/source/blender/compositor/operations/COM_GlareSimpleStarOperation.cpp b/source/blender/compositor/operations/COM_GlareSimpleStarOperation.cpp
index 957ac5af748..57aa3a1bac2 100644
--- a/source/blender/compositor/operations/COM_GlareSimpleStarOperation.cpp
+++ b/source/blender/compositor/operations/COM_GlareSimpleStarOperation.cpp
@@ -44,18 +44,18 @@ void GlareSimpleStarOperation::generateGlare(float *data, MemoryBuffer *inputTil
 				xp = x + i;
 				tbuf1->read(c, x, y);
 				mul_v3_fl(c, f1);
-				tbuf1->read(tc, (settings->angle ? xm : x), ym);
+				tbuf1->read(tc, (settings->star_45 ? xm : x), ym);
 				madd_v3_v3fl(c, tc, f2);
-				tbuf1->read(tc, (settings->angle ? xp : x), yp);
+				tbuf1->read(tc, (settings->star_45 ? xp : x), yp);
 				madd_v3_v3fl(c, tc, f2);
 				c[3] = 1.0f;
 				tbuf1->writePixel(x, y, c);
 
 				tbuf2->read(c, x, y);
 				mul_v3_fl(c, f1);
-				tbuf2->read(tc, xm, (settings->angle ? yp : y));
+				tbuf2->read(tc, xm, (settings->star_45 ? yp : y));
 				madd_v3_v3fl(c, tc, f2);
-				tbuf2->read(tc, xp, (settings->angle ? ym : y));
+				tbuf2->read(tc, xp, (settings->star_45 ? ym : y));
 				madd_v3_v3fl(c, tc, f2);
 				c[3] = 1.0f;
 				tbuf2->writePixel(x, y, c);
@@ -73,18 +73,18 @@ void GlareSimpleStarOperation::generateGlare(float *data, MemoryBuffer *inputTil
 				xp = x + i;
 				tbuf1->read(c, x, y);
 				mul_v3_fl(c, f1);
-				tbuf1->read(tc, (settings->angle ? xm : x), ym);
+				tbuf1->read(tc, (settings->star_45 ? xm : x), ym);
 				madd_v3_v3fl(c, tc, f2);
-				tbuf1->read(tc, (settings->angle ? xp : x), yp);
+				tbuf1->read(tc, (settings->star_45 ? xp : x), yp);
 				madd_v3_v3fl(c, tc, f2);
 				c[3] = 1.0f;
 				tbuf1->writePixel(x, y, c);
 
 				tbuf2->read(c, x, y);
 				mul_v3_fl(c, f1);
-				tbuf2->read(tc, xm, (settings->angle ? yp : y));
+				tbuf2->read(tc, xm, (settings->star_45 ? yp : y));
 				madd_v3_v3fl(c, tc, f2);
-				tbuf2->read(tc, xp, (settings->angle ? ym : y));
+				tbuf2->read(tc, xp, (settings->star_45 ? ym : y));
 				madd_v3_v3fl(c, tc, f2);
 				c[3] = 1.0f;
 				tbuf2->writePixel(x, y, c);
diff --git a/source/blender/compositor/operations/COM_GlareStreaksOperation.cpp b/source/blender/compositor/operations/COM_GlareStreaksOperation.cpp
index da6076337b4..535f2952e5d 100644
--- a/source/blender/compositor/operations/COM_GlareStreaksOperation.cpp
+++ b/source/blender/compositor/operations/COM_GlareStreaksOperation.cpp
@@ -28,7 +28,7 @@ void GlareStreaksOperation::generateGlare(float *data, MemoryBuffer *inputTile,
 	int x, y, n;
 	unsigned int nump = 0;
 	float c1[4], c2[4], c3[4], c4[4];
-	float a, ang = DEG2RADF(360.0f) / (float)settings->angle;
+	float a, ang = DEG2RADF(360.0f) / (float)settings->streaks;
 
 	int size = inputTile->getWidth() * inputTile->getHeight();
 	int size4 = size * 4;
diff --git a/source/blender/depsgraph/intern/builder/deg_builder.cc b/source/blender/depsgraph/intern/builder/deg_builder.cc
index cb2f057a090..828da6cb056 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder.cc
@@ -77,28 +77,8 @@ static bool check_object_needs_evaluation(Object *object)
 	return false;
 }
 
-void deg_graph_build_finalize(Depsgraph *graph)
+void deg_graph_build_flush_layers(Depsgraph *graph)
 {
-	/* STEP 1: Make sure new invisible dependencies are ready for use.
-	 *
-	 * TODO(sergey): This might do a bit of extra tagging, but it's kinda nice
-	 * to do it ahead of a time and don't spend time on flushing updates on
-	 * every frame change.
-	 */
-	GHASH_FOREACH_BEGIN(IDDepsNode *, id_node, graph->id_hash)
-	{
-		if (id_node->layers == 0) {
-			ID *id = id_node->id;
-			if (GS(id->name) == ID_OB) {
-				Object *object = (Object *)id;
-				if (check_object_needs_evaluation(object)) {
-					id_node->tag_update(graph);
-				}
-			}
-		}
-	}
-	GHASH_FOREACH_END();
-	/* STEP 2: Flush visibility layers from children to parent. */
 	std::stack<OperationDepsNode *> stack;
 	foreach (OperationDepsNode *node, graph->operations) {
 		IDDepsNode *id_node = node->owner->owner;
@@ -143,6 +123,31 @@ void deg_graph_build_finalize(Depsgraph *graph)
 			}
 		}
 	}
+}
+
+void deg_graph_build_finalize(Depsgraph *graph)
+{
+	/* STEP 1: Make sure new invisible dependencies are ready for use.
+	 *
+	 * TODO(sergey): This might do a bit of extra tagging, but it's kinda nice
+	 * to do it ahead of a time and don't spend time on flushing updates on
+	 * every frame change.
+	 */
+	GHASH_FOREACH_BEGIN(IDDepsNode *, id_node, graph->id_hash)
+	{
+		if (id_node->layers == 0) {
+			ID *id = id_node->id;
+			if (GS(id->name) == ID_OB) {
+				Object *object = (Object *)id;
+				if (check_object_needs_evaluation(object)) {
+					id_node->tag_update(graph);
+				}
+			}
+		}
+	}
+	GHASH_FOREACH_END();
+	/* STEP 2: Flush visibility layers from children to parent. */
+	deg_graph_build_flush_layers(graph);
 	/* STEP 3: Re-tag IDs for update if it was tagged before the relations
 	 * update tag.
 	 */
@@ -154,7 +159,7 @@ void deg_graph_build_finalize(Depsgraph *graph)
 		}
 		GHASH_FOREACH_END();
 
-		if ((id_node->layers & graph->layers) != 0) {
+		if ((id_node->layers & graph->layers) != 0 || graph->layers == 0) {
 			ID *id = id_node->id;
 			if ((id->tag & LIB_TAG_ID_RECALC_ALL) &&
 			    (id->tag & LIB_TAG_DOIT))
diff --git a/source/blender/depsgraph/intern/builder/deg_builder.h b/source/blender/depsgraph/intern/builder/deg_builder.h
index bdc030e3810..3cc51a2d7db 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder.h
+++ b/source/blender/depsgraph/intern/builder/deg_builder.h
@@ -42,5 +42,6 @@ struct Depsgraph;
 string deg_fcurve_id_name(const FCurve *fcu);
 
 void deg_graph_build_finalize(struct Depsgraph *graph);
+void deg_graph_build_flush_layers(struct Depsgraph *graph);
 
 }  // namespace DEG
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_nodes.cc b/source/blender/depsgraph/intern/builder/deg_builder_nodes.cc
index 6b7fb5246ca..7bcaee61fe0 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_nodes.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder_nodes.cc
@@ -432,7 +432,7 @@ void DepsgraphNodeBuilder::build_object(Scene *scene, Base *base, Object *ob)
 	}
 
 	/* Object data. */
-	if (ob->data) {
+	if (ob->data != NULL) {
 		/* type-specific data... */
 		switch (ob->type) {
 			case OB_MESH:     /* Geometry */
@@ -490,14 +490,25 @@ void DepsgraphNodeBuilder::build_object(Scene *scene, Base *base, Object *ob)
 	build_animdata(&ob->id);
 
 	/* particle systems */
-	if (ob->particlesystem.first) {
+	if (ob->particlesystem.first != NULL) {
 		build_particles(scene, ob);
 	}
 
-	/* grease pencil */
-	if (ob->gpd) {
+	/* Grease pencil. */
+	if (ob->gpd != NULL) {
 		build_gpencil(ob->gpd);
 	}
+
+	/* Object that this is a proxy for. */
+	if (ob->proxy) {
+		ob->proxy->proxy_from = ob;
+		build_object(scene, base, ob->proxy);
+	}
+
+	/* Object dupligroup. */
+	if (ob->dup_group != NULL) {
+		build_group(scene, base, ob->dup_group);
+	}
 }
 
 void DepsgraphNodeBuilder::build_object_transform(Scene *scene, Object *ob)
@@ -734,7 +745,8 @@ void DepsgraphNodeBuilder::build_particles(Scene *scene, Object *ob)
 	 */
 
 	/* component for all particle systems */
-	ComponentDepsNode *psys_comp = add_component_node(&ob->id, DEPSNODE_TYPE_EVAL_PARTICLES);
+	ComponentDepsNode *psys_comp =
+	        add_component_node(&ob->id, DEPSNODE_TYPE_EVAL_PARTICLES);
 
 	/* particle systems */
 	LINKLIST_FOREACH (ParticleSystem *, psys, &ob->particlesystem) {
@@ -747,11 +759,12 @@ void DepsgraphNodeBuilder::build_particles(Scene *scene, Object *ob)
 		/* this particle system */
 		// TODO: for now, this will just be a placeholder "ubereval" node
 		add_operation_node(psys_comp,
-		                   DEPSOP_TYPE_EXEC, function_bind(BKE_particle_system_eval,
-		                                                   _1,
-		                                                   scene,
-		                                                   ob,
-		                                                   psys),
+		                   DEPSOP_TYPE_EXEC,
+		                   function_bind(BKE_particle_system_eval,
+		                                 _1,
+		                                 scene,
+		                                 ob,
+		                                 psys),
 		                   DEG_OPCODE_PSYS_EVAL,
 		                   psys->name);
 	}
@@ -760,6 +773,20 @@ void DepsgraphNodeBuilder::build_particles(Scene *scene, Object *ob)
 	// TODO...
 }
 
+void DepsgraphNodeBuilder::build_cloth(Scene *scene, Object *object)
+{
+	ComponentDepsNode *cache_comp = add_component_node(&object->id,
+	                                                   DEPSNODE_TYPE_CACHE);
+	add_operation_node(cache_comp,
+	                   DEPSOP_TYPE_EXEC,
+	                   function_bind(BKE_object_eval_cloth,
+	                                 _1,
+	                                 scene,
+	                                 object),
+	                   DEG_OPCODE_PLACEHOLDER,
+	                   "Cloth Modifier");
+}
+
 /* Shapekeys */
 void DepsgraphNodeBuilder::build_shapekeys(Key *key)
 {
@@ -821,6 +848,9 @@ void DepsgraphNodeBuilder::build_obdata_geom(Scene *scene, Object *ob)
 		                                 md),
 		                   DEG_OPCODE_GEOMETRY_MODIFIER,
 		                   md->name);
+		if (md->type == eModifierType_Cloth) {
+			build_cloth(scene, ob);
+		}
 	}
 
 	/* materials */
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_nodes.h b/source/blender/depsgraph/intern/builder/deg_builder_nodes.h
index c5035f35f6e..10b586342dd 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_nodes.h
+++ b/source/blender/depsgraph/intern/builder/deg_builder_nodes.h
@@ -133,6 +133,7 @@ struct DepsgraphNodeBuilder {
 	void build_pose_constraints(Object *ob, bPoseChannel *pchan);
 	void build_rigidbody(Scene *scene);
 	void build_particles(Scene *scene, Object *ob);
+	void build_cloth(Scene *scene, Object *object);
 	void build_animdata(ID *id);
 	OperationDepsNode *build_driver(ID *id, FCurve *fcurve);
 	void build_ik_pose(Scene *scene,
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_nodes_scene.cc b/source/blender/depsgraph/intern/builder/deg_builder_nodes_scene.cc
index 99e61692231..7dd694cb570 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_nodes_scene.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder_nodes_scene.cc
@@ -81,21 +81,7 @@ void DepsgraphNodeBuilder::build_scene(Main *bmain, Scene *scene)
 	/* scene objects */
 	LINKLIST_FOREACH (Base *, base, &scene->base) {
 		Object *ob = base->object;
-
-		/* object itself */
 		build_object(scene, base, ob);
-
-		/* object that this is a proxy for */
-		// XXX: the way that proxies work needs to be completely reviewed!
-		if (ob->proxy) {
-			ob->proxy->proxy_from = ob;
-			build_object(scene, base, ob->proxy);
-		}
-
-		/* Object dupligroup. */
-		if (ob->dup_group) {
-			build_group(scene, base, ob->dup_group);
-		}
 	}
 
 	/* rigidbody */
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations.cc b/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
index 94cff521edc..7f2c6d8d9a1 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations.cc
@@ -396,30 +396,52 @@ void DepsgraphRelationBuilder::build_object(Main *bmain, Scene *scene, Object *o
 	OperationKey ob_ubereval_key(&ob->id, DEPSNODE_TYPE_TRANSFORM, DEG_OPCODE_OBJECT_UBEREVAL);
 
 	/* parenting */
-	if (ob->parent) {
+	if (ob->parent != NULL) {
 		/* parent relationship */
 		build_object_parent(ob);
 
 		/* local -> parent */
-		add_relation(local_transform_key, parent_transform_key, DEPSREL_TYPE_COMPONENT_ORDER, "[ObLocal -> ObParent]");
+		add_relation(local_transform_key,
+		             parent_transform_key,
+		             DEPSREL_TYPE_COMPONENT_ORDER,
+		             "[ObLocal -> ObParent]");
 	}
 
 	/* object constraints */
-	if (ob->constraints.first) {
-		OperationKey constraint_key(&ob->id, DEPSNODE_TYPE_TRANSFORM, DEG_OPCODE_TRANSFORM_CONSTRAINTS);
+	if (ob->constraints.first != NULL) {
+		OperationKey constraint_key(&ob->id,
+		                            DEPSNODE_TYPE_TRANSFORM,
+		                            DEG_OPCODE_TRANSFORM_CONSTRAINTS);
 
 		/* constraint relations */
 		// TODO: provide base op
 		// XXX: this is broken
-		build_constraints(scene, &ob->id, DEPSNODE_TYPE_TRANSFORM, "", &ob->constraints, NULL);
+		build_constraints(scene,
+		                  &ob->id,
+		                  DEPSNODE_TYPE_TRANSFORM,
+		                  "",
+		                  &ob->constraints,
+		                  NULL);
 
 		/* operation order */
-		add_relation(base_op_key, constraint_key, DEPSREL_TYPE_COMPONENT_ORDER, "[ObBase-> Constraint Stack]");
-		add_relation(constraint_key, final_transform_key, DEPSREL_TYPE_COMPONENT_ORDER, "[ObConstraints -> Done]");
+		add_relation(base_op_key,
+		             constraint_key,
+		             DEPSREL_TYPE_COMPONENT_ORDER,
+		             "[ObBase-> Constraint Stack]");
+		add_relation(constraint_key,
+		             final_transform_key,
+		             DEPSREL_TYPE_COMPONENT_ORDER,
+		             "[ObConstraints -> Done]");
 
 		// XXX
-		add_relation(constraint_key, ob_ubereval_key, DEPSREL_TYPE_COMPONENT_ORDER, "Temp Ubereval");
-		add_relation(ob_ubereval_key, final_transform_key, DEPSREL_TYPE_COMPONENT_ORDER, "Temp Ubereval");
+		add_relation(constraint_key,
+		             ob_ubereval_key,
+		             DEPSREL_TYPE_COMPONENT_ORDER,
+		             "Temp Ubereval");
+		add_relation(ob_ubereval_key,
+		             final_transform_key,
+		             DEPSREL_TYPE_COMPONENT_ORDER,
+		             "Temp Ubereval");
 	}
 	else {
 		/* NOTE: Keep an eye here, we skip some relations here to "streamline"
@@ -448,7 +470,10 @@ void DepsgraphRelationBuilder::build_object(Main *bmain, Scene *scene, Object *o
 	// XXX: This should be hooked up by the build_animdata code
 	if (needs_animdata_node(&ob->id)) {
 		ComponentKey adt_key(&ob->id, DEPSNODE_TYPE_ANIMATION);
-		add_relation(adt_key, local_transform_key, DEPSREL_TYPE_OPERATION, "Object Animation");
+		add_relation(adt_key,
+		             local_transform_key,
+		             DEPSREL_TYPE_OPERATION,
+		             "Object Animation");
 	}
 
 
@@ -494,25 +519,48 @@ void DepsgraphRelationBuilder::build_object(Main *bmain, Scene *scene, Object *o
 		if (key != NULL) {
 			ComponentKey geometry_key((ID *)ob->data, DEPSNODE_TYPE_GEOMETRY);
 			ComponentKey key_key(&key->id, DEPSNODE_TYPE_GEOMETRY);
-			add_relation(key_key, geometry_key, DEPSREL_TYPE_GEOMETRY_EVAL, "Shapekeys");
+			add_relation(key_key,
+			             geometry_key,
+			             DEPSREL_TYPE_GEOMETRY_EVAL,
+			             "Shapekeys");
 		}
 	}
 
-	/* particle systems */
-	if (ob->particlesystem.first) {
+	/* Particle systems. */
+	if (ob->particlesystem.first != NULL) {
 		build_particles(scene, ob);
 	}
 
-	/* grease pencil */
-	if (ob->gpd) {
+	/* Grease pencil. */
+	if (ob->gpd != NULL) {
 		build_gpencil(ob->gpd);
 	}
+
+	/* Object that this is a proxy for. */
+	if (ob->proxy != NULL) {
+		ob->proxy->proxy_from = ob;
+		build_object(bmain, scene, ob->proxy);
+		/* TODO(sergey): This is an inverted relation which matches old depsgraph
+		 * behavior; need to investigate whether it still needs to be inverted.
+		 */
+		ComponentKey ob_pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
+		ComponentKey proxy_pose_key(&ob->proxy->id, DEPSNODE_TYPE_EVAL_POSE);
+		add_relation(ob_pose_key, proxy_pose_key, DEPSREL_TYPE_TRANSFORM, "Proxy");
+	}
+
+	/* Object dupligroup. */
+	if (ob->dup_group != NULL) {
+		build_group(bmain, scene, ob, ob->dup_group);
+	}
 }
 
 void DepsgraphRelationBuilder::build_object_parent(Object *ob)
 {
-	/* XXX: for now, need to use the component key (not just direct to the parent op), or else the matrix doesn't get reset */
-	// XXX: @sergey - it would be good if we got that backwards flushing working when tagging for updates
+	/* XXX: for now, need to use the component key (not just direct to the parent op),
+	 * or else the matrix doesn't get reset.
+	 */
+	// XXX: @sergey - it would be good if we got that backwards flushing working
+	// when tagging for updates.
 	//OperationKey ob_key(&ob->id, DEPSNODE_TYPE_TRANSFORM, DEG_OPCODE_TRANSFORM_PARENT);
 	ComponentKey ob_key(&ob->id, DEPSNODE_TYPE_TRANSFORM);
 
@@ -1312,6 +1360,25 @@ void DepsgraphRelationBuilder::build_particles(Scene *scene, Object *ob)
 	// TODO...
 }
 
+void DepsgraphRelationBuilder::build_cloth(Scene * /*scene*/,
+                                           Object *object,
+                                           ModifierData *md)
+{
+	OperationKey cache_key(&object->id,
+	                       DEPSNODE_TYPE_CACHE,
+	                       DEG_OPCODE_PLACEHOLDER,
+	                       "Cloth Modifier");
+	/* The cache component affects the modifier. */
+	OperationKey modifier_key(&object->id,
+	                          DEPSNODE_TYPE_GEOMETRY,
+	                          DEG_OPCODE_GEOMETRY_MODIFIER,
+	                          md->name);
+	add_relation(cache_key,
+	             modifier_key,
+	             DEPSREL_TYPE_TIME,
+	             "Cloth Cache -> Cloth");
+}
+
 /* Shapekeys */
 void DepsgraphRelationBuilder::build_shapekeys(ID *obdata, Key *key)
 {
@@ -1413,6 +1480,10 @@ void DepsgraphRelationBuilder::build_obdata_geom(Main *bmain, Scene *scene, Obje
 				}
 			}
 
+			if (md->type == eModifierType_Cloth) {
+				build_cloth(scene, ob, md);
+			}
+
 			prev_mod_key = mod_key;
 		}
 	}
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations.h b/source/blender/depsgraph/intern/builder/deg_builder_relations.h
index 054e4103290..4ca95bebe3f 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_relations.h
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations.h
@@ -58,6 +58,7 @@ struct Main;
 struct Mask;
 struct Material;
 struct MTex;
+struct ModifierData;
 struct MovieClip;
 struct bNodeTree;
 struct Object;
@@ -205,6 +206,7 @@ struct DepsgraphRelationBuilder
 	void build_world(World *world);
 	void build_rigidbody(Scene *scene);
 	void build_particles(Scene *scene, Object *ob);
+	void build_cloth(Scene *scene, Object *object, ModifierData *md);
 	void build_ik_pose(Object *ob,
 	                   bPoseChannel *pchan,
 	                   bConstraint *con,
diff --git a/source/blender/depsgraph/intern/builder/deg_builder_relations_scene.cc b/source/blender/depsgraph/intern/builder/deg_builder_relations_scene.cc
index 8a3476cff45..3bf435c37e0 100644
--- a/source/blender/depsgraph/intern/builder/deg_builder_relations_scene.cc
+++ b/source/blender/depsgraph/intern/builder/deg_builder_relations_scene.cc
@@ -76,26 +76,7 @@ void DepsgraphRelationBuilder::build_scene(Main *bmain, Scene *scene)
 	/* scene objects */
 	LINKLIST_FOREACH (Base *, base, &scene->base) {
 		Object *ob = base->object;
-
-		/* object itself */
 		build_object(bmain, scene, ob);
-
-		/* object that this is a proxy for */
-		if (ob->proxy) {
-			ob->proxy->proxy_from = ob;
-			build_object(bmain, scene, ob->proxy);
-			/* TODO(sergey): This is an inverted relation, matches old depsgraph
-			 * behavior and need to be investigated if it still need to be inverted.
-			 */
-			ComponentKey ob_pose_key(&ob->id, DEPSNODE_TYPE_EVAL_POSE);
-			ComponentKey proxy_pose_key(&ob->proxy->id, DEPSNODE_TYPE_EVAL_POSE);
-			add_relation(ob_pose_key, proxy_pose_key, DEPSREL_TYPE_TRANSFORM, "Proxy");
-		}
-
-		/* Object dupligroup. */
-		if (ob->dup_group) {
-			build_group(bmain, scene, ob, ob->dup_group);
-		}
 	}
 
 	/* rigidbody */
diff --git a/source/blender/depsgraph/intern/depsgraph_tag.cc b/source/blender/depsgraph/intern/depsgraph_tag.cc
index e8ed03666a6..cfc9005a1e3 100644
--- a/source/blender/depsgraph/intern/depsgraph_tag.cc
+++ b/source/blender/depsgraph/intern/depsgraph_tag.cc
@@ -56,8 +56,8 @@ extern "C" {
 #include "DEG_depsgraph.h"
 } /* extern "C" */
 
+#include "intern/builder/deg_builder.h"
 #include "intern/eval/deg_eval_flush.h"
-
 #include "intern/nodes/deg_node.h"
 #include "intern/nodes/deg_node_component.h"
 #include "intern/nodes/deg_node_operation.h"
@@ -346,6 +346,27 @@ void DEG_graph_on_visible_update(Main *bmain, Scene *scene)
 		GHASH_FOREACH_END();
 	}
 	scene->lay_updated |= graph->layers;
+	/* Special trick to get local view to work.  */
+	LINKLIST_FOREACH (Base *, base, &scene->base) {
+		Object *object = base->object;
+		DEG::IDDepsNode *id_node = graph->find_id_node(&object->id);
+		id_node->layers = 0;
+	}
+	LINKLIST_FOREACH (Base *, base, &scene->base) {
+		Object *object = base->object;
+		DEG::IDDepsNode *id_node = graph->find_id_node(&object->id);
+		id_node->layers |= base->lay;
+	}
+	DEG::deg_graph_build_flush_layers(graph);
+	LINKLIST_FOREACH (Base *, base, &scene->base) {
+		Object *object = base->object;
+		DEG::IDDepsNode *id_node = graph->find_id_node(&object->id);
+		GHASH_FOREACH_BEGIN(DEG::ComponentDepsNode *, comp, id_node->components)
+		{
+			id_node->layers |= comp->layers;
+		}
+		GHASH_FOREACH_END();
+	}
 }
 
 void DEG_on_visible_update(Main *bmain, const bool UNUSED(do_time))
diff --git a/source/blender/depsgraph/intern/depsgraph_types.h b/source/blender/depsgraph/intern/depsgraph_types.h
index effd34a0eb9..c9c4329769d 100644
--- a/source/blender/depsgraph/intern/depsgraph_types.h
+++ b/source/blender/depsgraph/intern/depsgraph_types.h
@@ -44,9 +44,6 @@
 #include <string>
 #include <vector>
 
-using std::string;
-using std::vector;
-
 struct bAction;
 struct ChannelDriver;
 struct ModifierData;
@@ -56,6 +53,9 @@ struct FCurve;
 
 namespace DEG {
 
+using std::string;
+using std::vector;
+
 /* Evaluation Operation for atomic operation */
 // XXX: move this to another header that can be exposed?
 typedef function<void(struct EvaluationContext *)> DepsEvalOperationCb;
@@ -89,52 +89,52 @@ typedef enum eDepsNode_Type {
 	/* **** Generic Types **** */
 
 	/* "Current Scene" - basically whatever kicks off the evaluation process. */
-	DEPSNODE_TYPE_ROOT             = 1,
+	DEPSNODE_TYPE_ROOT,
 	/* Time-Source */
-	DEPSNODE_TYPE_TIMESOURCE       = 2,
+	DEPSNODE_TYPE_TIMESOURCE,
 	/* ID-Block reference - used as landmarks/collection point for components,
 	 * but not usually part of main graph.
 	 */
-	DEPSNODE_TYPE_ID_REF           = 3,
+	DEPSNODE_TYPE_ID_REF,
 	/* Isolated sub-graph - used for keeping instanced data separate from
 	 * instances using them.
 	 */
-	DEPSNODE_TYPE_SUBGRAPH         = 4,
+	DEPSNODE_TYPE_SUBGRAPH,
 
 	/* **** Outer Types **** */
 
 	/* Parameters Component - Default when nothing else fits
 	 * (i.e. just SDNA property setting).
 	 */
-	DEPSNODE_TYPE_PARAMETERS       = 11,
+	DEPSNODE_TYPE_PARAMETERS,
 	/* Generic "Proxy-Inherit" Component
 	 * XXX: Also for instancing of subgraphs?
 	 */
-	DEPSNODE_TYPE_PROXY            = 12,
+	DEPSNODE_TYPE_PROXY,
 	/* Animation Component
 	 *
 	 * XXX: merge in with parameters?
 	 */
-	DEPSNODE_TYPE_ANIMATION        = 13,
+	DEPSNODE_TYPE_ANIMATION,
 	/* Transform Component (Parenting/Constraints) */
-	DEPSNODE_TYPE_TRANSFORM        = 14,
+	DEPSNODE_TYPE_TRANSFORM,
 	/* Geometry Component (DerivedMesh/Displist) */
-	DEPSNODE_TYPE_GEOMETRY         = 15,
+	DEPSNODE_TYPE_GEOMETRY,
 	/* Sequencer Component (Scene Only) */
-	DEPSNODE_TYPE_SEQUENCER        = 16,
+	DEPSNODE_TYPE_SEQUENCER,
 
 	/* **** Evaluation-Related Outer Types (with Subdata) **** */
 
 	/* Pose Component - Owner/Container of Bones Eval */
-	DEPSNODE_TYPE_EVAL_POSE        = 21,
+	DEPSNODE_TYPE_EVAL_POSE,
 	/* Bone Component - Child/Subcomponent of Pose */
-	DEPSNODE_TYPE_BONE             = 22,
+	DEPSNODE_TYPE_BONE,
 	/* Particle Systems Component */
-	DEPSNODE_TYPE_EVAL_PARTICLES   = 23,
+	DEPSNODE_TYPE_EVAL_PARTICLES,
 	/* Material Shading Component */
-	DEPSNODE_TYPE_SHADING          = 24,
+	DEPSNODE_TYPE_SHADING,
 	/* Cache Component */
-	DEPSNODE_TYPE_CACHE            = 25,
+	DEPSNODE_TYPE_CACHE,
 } eDepsNode_Type;
 
 /* Identifiers for common operations (as an enum). */
diff --git a/source/blender/depsgraph/intern/eval/deg_eval.cc b/source/blender/depsgraph/intern/eval/deg_eval.cc
index 3a042535d26..e739bc9dbb5 100644
--- a/source/blender/depsgraph/intern/eval/deg_eval.cc
+++ b/source/blender/depsgraph/intern/eval/deg_eval.cc
@@ -95,105 +95,38 @@ static void deg_task_run_func(TaskPool *pool,
 	/* Should only be the case for NOOPs, which never get to this point. */
 	BLI_assert(node->evaluate);
 
-	while (true) {
-		/* Get context. */
-		/* TODO: Who initialises this? "Init" operations aren't able to
-		 * initialise it!!!
-		 */
-		/* TODO(sergey): We don't use component contexts at this moment. */
-		/* ComponentDepsNode *comp = node->owner; */
-		BLI_assert(node->owner != NULL);
-
-		/* Since we're not leaving the thread for until the graph branches it is
-		 * possible to have NO-OP on the way. for which evaluate() will be NULL.
-		 * but that's all fine, we'll just scheduler it's children.
-		 */
-		if (node->evaluate) {
+	/* Get context. */
+	/* TODO: Who initialises this? "Init" operations aren't able to
+	 * initialise it!!!
+	 */
+	/* TODO(sergey): We don't use component contexts at this moment. */
+	/* ComponentDepsNode *comp = node->owner; */
+	BLI_assert(node->owner != NULL);
+
+	/* Since we're not leaving the thread until the graph branches, it is
+	 * possible to have a NO-OP on the way, for which evaluate() will be NULL.
+	 * That's fine; we'll just schedule its children.
+	 */
+	if (node->evaluate) {
 			/* Take note of current time. */
 #ifdef USE_DEBUGGER
-			double start_time = PIL_check_seconds_timer();
-			DepsgraphDebug::task_started(state->graph, node);
+		double start_time = PIL_check_seconds_timer();
+		DepsgraphDebug::task_started(state->graph, node);
 #endif
 
-			/* Perform operation. */
-			node->evaluate(state->eval_ctx);
+		/* Perform operation. */
+		node->evaluate(state->eval_ctx);
 
 			/* Note how long this took. */
 #ifdef USE_DEBUGGER
-			double end_time = PIL_check_seconds_timer();
-			DepsgraphDebug::task_completed(state->graph,
-			                               node,
-			                               end_time - start_time);
+		double end_time = PIL_check_seconds_timer();
+		DepsgraphDebug::task_completed(state->graph,
+		                               node,
+		                               end_time - start_time);
 #endif
-		}
-
-		/* If there's only one outgoing link we try to immediately switch to
-		 * that node evaluation, without leaving the thread.
-		 *
-		 * It's only doable if the child don't have extra relations or all they
-		 * are satisfied.
-		 *
-		 * TODO(sergey): Checks here can be de-duplicated with the ones from
-		 * schedule_node(), however, how to do it nicely?
-		 */
-		if (node->outlinks.size() == 1) {
-			DepsRelation *rel = node->outlinks[0];
-			OperationDepsNode *child = (OperationDepsNode *)rel->to;
-			BLI_assert(child->type == DEPSNODE_TYPE_OPERATION);
-			if (!child->scheduled) {
-				unsigned int id_layers = child->owner->owner->layers;
-				if (!((child->flag & DEPSOP_FLAG_NEEDS_UPDATE) != 0 &&
-				      (id_layers & state->layers) != 0))
-				{
-					/* Node does not need an update, so can;t continue with the
-					 * chain and need to switch to another one by leaving the
-					 * thread.
-					 */
-					break;
-				}
-				if ((rel->flag & DEPSREL_FLAG_CYCLIC) == 0) {
-					BLI_assert(child->num_links_pending > 0);
-					atomic_sub_and_fetch_uint32(&child->num_links_pending, 1);
-				}
-				if (child->num_links_pending == 0) {
-					bool is_scheduled = atomic_fetch_and_or_uint8(
-					        (uint8_t *)&child->scheduled, (uint8_t)true);
-					if (!is_scheduled) {
-						/* Node was not scheduled, switch to it! */
-						node = child;
-					}
-					else {
-						/* Someone else scheduled the node, leaving us
-						 * unemployed in this thread, we're leaving.
-						 */
-						break;
-					}
-				}
-				else {
-					/* There are other dependencies on the child, can't do
-					 * anything in the current thread.
-					 */
-					break;
-				}
-			}
-			else {
-				/* Happens when having cyclic dependencies.
-				 *
-				 * Nothing to do here, single child was already scheduled, we
-				 * can leave the thread now.
-				 */
-				break;
-			}
-		}
-		else {
-			/* TODO(sergey): It's possible to use one of the outgoing relations
-			 * as a chain which we'll try to keep alive, but it's a bit more
-			 * involved change.
-			 */
-			schedule_children(pool, state->graph, node, state->layers, thread_id);
-			break;
-		}
 	}
+
+	schedule_children(pool, state->graph, node, state->layers, thread_id);
 }
 
 typedef struct CalculatePengindData {
@@ -378,12 +311,19 @@ void deg_evaluate_on_refresh(EvaluationContext *eval_ctx,
 	state.graph = graph;
 	state.layers = layers;
 
-	TaskScheduler *task_scheduler = BLI_task_scheduler_get();
-	TaskPool *task_pool = BLI_task_pool_create(task_scheduler, &state);
+	TaskScheduler *task_scheduler;
+	bool need_free_scheduler;
 
 	if (G.debug & G_DEBUG_DEPSGRAPH_NO_THREADS) {
-		BLI_pool_set_num_threads(task_pool, 1);
+		task_scheduler = BLI_task_scheduler_create(1);
+		need_free_scheduler = true;
 	}
+	else {
+		task_scheduler = BLI_task_scheduler_get();
+		need_free_scheduler = false;
+	}
+
+	TaskPool *task_pool = BLI_task_pool_create_suspended(task_scheduler, &state);
 
 	calculate_pending_parents(graph, layers);
 
@@ -410,6 +350,10 @@ void deg_evaluate_on_refresh(EvaluationContext *eval_ctx,
 
 	/* Clear any uncleared tags - just in case. */
 	deg_graph_clear_tags(graph);
+
+	if (need_free_scheduler) {
+		BLI_task_scheduler_free(task_scheduler);
+	}
 }
 
 }  // namespace DEG
diff --git a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc
index 7c6c25bef0d..e10f86f6e95 100644
--- a/source/blender/depsgraph/intern/eval/deg_eval_flush.cc
+++ b/source/blender/depsgraph/intern/eval/deg_eval_flush.cc
@@ -180,6 +180,11 @@ void deg_graph_flush_updates(Main *bmain, Depsgraph *graph)
 			comp_node->done = 1;
 
 			/* Flush to nodes along links... */
+			/* TODO(sergey): This mainly gives a speedup due to fewer queue pushes, which
+			 * reduces the number of memory allocations.
+			 *
+			 * We should try to solve the allocation issue instead of doing crazy things here.
+			 */
 			if (node->outlinks.size() == 1) {
 				OperationDepsNode *to_node = (OperationDepsNode *)node->outlinks[0]->to;
 				if (to_node->scheduled == false) {
diff --git a/source/blender/editors/animation/anim_channels_defines.c b/source/blender/editors/animation/anim_channels_defines.c
index 57302c18a88..5852ee72b19 100644
--- a/source/blender/editors/animation/anim_channels_defines.c
+++ b/source/blender/editors/animation/anim_channels_defines.c
@@ -2775,7 +2775,7 @@ static bAnimChannelType ACF_DSMCLIP =
 	acf_generic_indention_1,         /* indent level */
 	acf_generic_basic_offset,        /* offset */
 
-	acf_generic_idblock_name ,       /* name */
+	acf_generic_idblock_name,        /* name */
 	acf_generic_idfill_name_prop,    /* name prop */
 	acf_dsmclip_icon,                /* icon */
 
@@ -3856,7 +3856,8 @@ void ANIM_channel_draw(bAnimContext *ac, bAnimListElem *ale, float yminc, float
 	if (ac->sl) {
 		if ((ac->spacetype == SPACE_IPO) &&
 		    (acf->has_setting(ac, ale, ACHANNEL_SETTING_VISIBLE) ||
-		     acf->has_setting(ac, ale, ACHANNEL_SETTING_ALWAYS_VISIBLE))) {
+		     acf->has_setting(ac, ale, ACHANNEL_SETTING_ALWAYS_VISIBLE)))
+		{
 			/* for F-Curves, draw color-preview of curve behind checkbox */
 			if (ELEM(ale->type, ANIMTYPE_FCURVE, ANIMTYPE_NLACURVE)) {
 				FCurve *fcu = (FCurve *)ale->data;
diff --git a/source/blender/editors/animation/anim_draw.c b/source/blender/editors/animation/anim_draw.c
index 33e44d73894..98900812bb2 100644
--- a/source/blender/editors/animation/anim_draw.c
+++ b/source/blender/editors/animation/anim_draw.c
@@ -46,6 +46,8 @@
 #include "BLI_dlrbTree.h"
 
 #include "BKE_context.h"
+#include "BKE_curve.h"
+#include "BKE_fcurve.h"
 #include "BKE_global.h"
 #include "BKE_nla.h"
 #include "BKE_mask.h"
@@ -115,7 +117,8 @@ void ANIM_draw_cfra(const bContext *C, View2D *v2d, short flag)
 	/* Draw a light green line to indicate current frame */
 	UI_ThemeColor(TH_CFRAME);
 
-	const float x = (float)(scene->r.cfra * scene->r.framelen);
+	const float time = scene->r.cfra + scene->r.subframe;
+	const float x = (float)(time * scene->r.framelen);
 
 	glLineWidth((flag & DRAWCFRA_WIDE) ? 3.0 : 2.0);
 
@@ -308,7 +311,8 @@ static float normalization_factor_get(Scene *scene, FCurve *fcu, short flag, flo
 
 	fcu->prev_norm_factor = 1.0f;
 	if (fcu->bezt) {
-		BezTriple *bezt;
+		const bool use_preview_only = PRVRANGEON;
+		const BezTriple *bezt;
 		int i;
 		float max_coord = -FLT_MAX;
 		float min_coord = FLT_MAX;
@@ -318,28 +322,77 @@ static float normalization_factor_get(Scene *scene, FCurve *fcu, short flag, flo
 			return 1.0f;
 		}
 
-		if (PRVRANGEON) {
-			for (i = 0, bezt = fcu->bezt; i < fcu->totvert; i++, bezt++) {
-				if (IN_RANGE_INCL(bezt->vec[1][0], scene->r.psfra, scene->r.pefra)) {
-					max_coord = max_ff(max_coord, bezt->vec[0][1]);
-					max_coord = max_ff(max_coord, bezt->vec[1][1]);
-					max_coord = max_ff(max_coord, bezt->vec[2][1]);
-
-					min_coord = min_ff(min_coord, bezt->vec[0][1]);
-					min_coord = min_ff(min_coord, bezt->vec[1][1]);
-					min_coord = min_ff(min_coord, bezt->vec[2][1]);
-				}
+		for (i = 0, bezt = fcu->bezt; i < fcu->totvert; i++, bezt++) {
+			if (use_preview_only && !IN_RANGE_INCL(bezt->vec[1][0],
+			                                       scene->r.psfra,
+			                                       scene->r.pefra))
+			{
+				continue;
 			}
-		}
-		else {
-			for (i = 0, bezt = fcu->bezt; i < fcu->totvert; i++, bezt++) {
-				max_coord = max_ff(max_coord, bezt->vec[0][1]);
+
+			if (i == 0) {
+				/* We ignore extrapolation flags and handles here, and use the
+				 * control point position only, so that we normalize only the
+				 * "interesting" part of the curve.
+				 *
+				 * Here we handle left extrapolation.
+				 */
 				max_coord = max_ff(max_coord, bezt->vec[1][1]);
-				max_coord = max_ff(max_coord, bezt->vec[2][1]);
 
-				min_coord = min_ff(min_coord, bezt->vec[0][1]);
 				min_coord = min_ff(min_coord, bezt->vec[1][1]);
-				min_coord = min_ff(min_coord, bezt->vec[2][1]);
+			}
+			else {
+				const BezTriple *prev_bezt = bezt - 1;
+				if (prev_bezt->ipo == BEZT_IPO_CONST) {
+					/* Constant interpolation: previous CV value is used up
+					 * to the current keyframe.
+					 */
+					max_coord = max_ff(max_coord, bezt->vec[1][1]);
+					min_coord = min_ff(min_coord, bezt->vec[1][1]);
+				}
+				else if (prev_bezt->ipo == BEZT_IPO_LIN) {
+					/* Linear interpolation: min/max using both the previous
+					 * and the current CV.
+					 */
+					max_coord = max_ff(max_coord, bezt->vec[1][1]);
+					min_coord = min_ff(min_coord, bezt->vec[1][1]);
+					max_coord = max_ff(max_coord, prev_bezt->vec[1][1]);
+					min_coord = min_ff(min_coord, prev_bezt->vec[1][1]);
+				}
+				else if (prev_bezt->ipo == BEZT_IPO_BEZ) {
+					const int resol = fcu->driver
+					        ? 32
+					        : min_ii((int)(5.0f * len_v2v2(bezt->vec[1], prev_bezt->vec[1])), 32);
+					if (resol < 2) {
+						max_coord = max_ff(max_coord, prev_bezt->vec[1][1]);
+						min_coord = min_ff(min_coord, prev_bezt->vec[1][1]);
+					}
+					else {
+						float data[120];
+						float v1[2], v2[2], v3[2], v4[2];
+
+						v1[0] = prev_bezt->vec[1][0];
+						v1[1] = prev_bezt->vec[1][1];
+						v2[0] = prev_bezt->vec[2][0];
+						v2[1] = prev_bezt->vec[2][1];
+
+						v3[0] = bezt->vec[0][0];
+						v3[1] = bezt->vec[0][1];
+						v4[0] = bezt->vec[1][0];
+						v4[1] = bezt->vec[1][1];
+
+						correct_bezpart(v1, v2, v3, v4);
+
+						BKE_curve_forward_diff_bezier(v1[0], v2[0], v3[0], v4[0], data, resol, sizeof(float) * 3);
+						BKE_curve_forward_diff_bezier(v1[1], v2[1], v3[1], v4[1], data + 1, resol, sizeof(float) * 3);
+
+						for (int j = 0; j <= resol; ++j) {
+							const float *fp = &data[j * 3];
+							max_coord = max_ff(max_coord, fp[1]);
+							min_coord = min_ff(min_coord, fp[1]);
+						}
+					}
+				}
 			}
 		}
 
diff --git a/source/blender/editors/animation/anim_ops.c b/source/blender/editors/animation/anim_ops.c
index c0d6963acbb..0eb6508f7b2 100644
--- a/source/blender/editors/animation/anim_ops.c
+++ b/source/blender/editors/animation/anim_ops.c
@@ -95,7 +95,7 @@ static void change_frame_apply(bContext *C, wmOperator *op)
 {
 	Main *bmain = CTX_data_main(C);
 	Scene *scene = CTX_data_scene(C);
-	int frame = RNA_int_get(op->ptr, "frame");
+	float frame = RNA_float_get(op->ptr, "frame");
 	bool do_snap = RNA_boolean_get(op->ptr, "snap");
 
 	if (do_snap && CTX_wm_space_seq(C)) {
@@ -103,10 +103,16 @@ static void change_frame_apply(bContext *C, wmOperator *op)
 	}
 
 	/* set the new frame number */
-	CFRA = frame;
+	if (scene->r.flag & SCER_SHOW_SUBFRAME) {
+		CFRA = (int)frame;
+		SUBFRA = frame - (int)frame;
+	}
+	else {
+		CFRA = iroundf(frame);
+		SUBFRA = 0.0f;
+	}
 	FRAMENUMBER_MIN_CLAMP(CFRA);
-	SUBFRA = 0.0f;
-	
+
 	/* do updates */
 	BKE_sound_seek_scene(bmain, scene);
 	WM_event_add_notifier(C, NC_SCENE | ND_FRAME, scene);
@@ -125,19 +131,16 @@ static int change_frame_exec(bContext *C, wmOperator *op)
 /* ---- */
 
 /* Get frame from mouse coordinates */
-static int frame_from_event(bContext *C, const wmEvent *event)
+static float frame_from_event(bContext *C, const wmEvent *event)
 {
 	ARegion *region = CTX_wm_region(C);
 	Scene *scene = CTX_data_scene(C);
-	float viewx;
-	int frame;
+	float frame;
 
 	/* convert from region coordinates to View2D 'tot' space */
-	viewx = UI_view2d_region_to_view_x(&region->v2d, event->mval[0]);
-	
-	/* round result to nearest int (frames are ints!) */
-	frame = iroundf(viewx);
+	frame = UI_view2d_region_to_view_x(&region->v2d, event->mval[0]);
 	
+	/* respect preview range restrictions (if only allowed to move around within that range) */
 	if (scene->r.flag & SCER_LOCK_FRAME_SELECTION) {
 		CLAMP(frame, PSFRA, PEFRA);
 	}
@@ -187,7 +190,7 @@ static int change_frame_invoke(bContext *C, wmOperator *op, const wmEvent *event
 	 * as user could click on a single frame (jump to frame) as well as
 	 * click-dragging over a range (modal scrubbing).
 	 */
-	RNA_int_set(op->ptr, "frame", frame_from_event(C, event));
+	RNA_float_set(op->ptr, "frame", frame_from_event(C, event));
 
 	change_frame_seq_preview_begin(C, event);
 
@@ -215,7 +218,7 @@ static int change_frame_modal(bContext *C, wmOperator *op, const wmEvent *event)
 			break;
 
 		case MOUSEMOVE:
-			RNA_int_set(op->ptr, "frame", frame_from_event(C, event));
+			RNA_float_set(op->ptr, "frame", frame_from_event(C, event));
 			change_frame_apply(C, op);
 			break;
 		
@@ -268,7 +271,7 @@ static void ANIM_OT_change_frame(wmOperatorType *ot)
 	ot->undo_group = "FRAME_CHANGE";
 
 	/* rna */
-	ot->prop = RNA_def_int(ot->srna, "frame", 0, MINAFRAME, MAXFRAME, "Frame", "", MINAFRAME, MAXFRAME);
+	ot->prop = RNA_def_float(ot->srna, "frame", 0, MINAFRAME, MAXFRAME, "Frame", "", MINAFRAME, MAXFRAME);
 	prop = RNA_def_boolean(ot->srna, "snap", false, "Snap", "");
 	RNA_def_property_flag(prop, PROP_SKIP_SAVE);
 }
diff --git a/source/blender/editors/armature/armature_intern.h b/source/blender/editors/armature/armature_intern.h
index b39b4bd81ee..190b0610059 100644
--- a/source/blender/editors/armature/armature_intern.h
+++ b/source/blender/editors/armature/armature_intern.h
@@ -247,8 +247,10 @@ void armature_select_mirrored_ex(struct bArmature *arm, const int flag);
 void armature_select_mirrored(struct bArmature *arm);
 void armature_tag_unselect(struct bArmature *arm);
 
-void *get_nearest_bone(struct bContext *C, short findunsel, int x, int y);
-void *get_bone_from_selectbuffer(struct Scene *scene, struct Base *base, unsigned int *buffer, short hits, short findunsel, bool do_nearest);
+void *get_nearest_bone(struct bContext *C, const int xy[2], bool findunsel);
+void *get_bone_from_selectbuffer(
+        struct Scene *scene, struct Base *base, const unsigned int *buffer, short hits,
+        bool findunsel, bool do_nearest);
 
 int bone_looper(struct Object *ob, struct Bone *bone, void *data,
                 int (*bone_func)(struct Object *, struct Bone *, void *));
diff --git a/source/blender/editors/armature/armature_naming.c b/source/blender/editors/armature/armature_naming.c
index fa192ed6f36..c928508237d 100644
--- a/source/blender/editors/armature/armature_naming.c
+++ b/source/blender/editors/armature/armature_naming.c
@@ -362,7 +362,7 @@ static int armature_flip_names_exec(bContext *C, wmOperator *UNUSED(op))
 
 	arm = ob->data;
 
-	ListBase bones_names= {NULL};
+	ListBase bones_names = {NULL};
 
 	CTX_DATA_BEGIN(C, EditBone *, ebone, selected_editable_bones)
 	{
diff --git a/source/blender/editors/armature/armature_select.c b/source/blender/editors/armature/armature_select.c
index e9946abba0b..ed44214591c 100644
--- a/source/blender/editors/armature/armature_select.c
+++ b/source/blender/editors/armature/armature_select.c
@@ -74,7 +74,9 @@ Bone *get_indexed_bone(Object *ob, int index)
 
 /* See if there are any selected bones in this buffer */
 /* only bones from base are checked on */
-void *get_bone_from_selectbuffer(Scene *scene, Base *base, unsigned int *buffer, short hits, short findunsel, bool do_nearest)
+void *get_bone_from_selectbuffer(
+        Scene *scene, Base *base, const unsigned int *buffer, short hits,
+        bool findunsel, bool do_nearest)
 {
 	Object *obedit = scene->obedit; // XXX get from context
 	Bone *bone;
@@ -103,8 +105,8 @@ void *get_bone_from_selectbuffer(Scene *scene, Base *base, unsigned int *buffer,
 							sel = (bone->flag & BONE_SELECTED);
 						else
 							sel = !(bone->flag & BONE_SELECTED);
-						
-						data = bone;						
+
+						data = bone;
 					}
 					else {
 						data = NULL;
@@ -162,7 +164,7 @@ void *get_bone_from_selectbuffer(Scene *scene, Base *base, unsigned int *buffer,
 /* used by posemode as well editmode */
 /* only checks scene->basact! */
 /* x and y are mouse coords (area space) */
-void *get_nearest_bone(bContext *C, short findunsel, int x, int y)
+void *get_nearest_bone(bContext *C, const int xy[2], bool findunsel)
 {
 	ViewContext vc;
 	rcti rect;
@@ -172,10 +174,10 @@ void *get_nearest_bone(bContext *C, short findunsel, int x, int y)
 	view3d_set_viewcontext(C, &vc);
 	
 	// rect.xmin = ... mouseco!
-	rect.xmin = rect.xmax = x;
-	rect.ymin = rect.ymax = y;
+	rect.xmin = rect.xmax = xy[0];
+	rect.ymin = rect.ymax = xy[1];
 	
-	hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true);
+	hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, VIEW3D_SELECT_PICK_NEAREST);
 
 	if (hits > 0)
 		return get_bone_from_selectbuffer(vc.scene, vc.scene->basact, buffer, hits, findunsel, true);
@@ -197,10 +199,7 @@ static int armature_select_linked_invoke(bContext *C, wmOperator *op, const wmEv
 
 	view3d_operator_needs_opengl(C);
 
-	if (extend)
-		bone = get_nearest_bone(C, 0, event->mval[0], event->mval[1]);
-	else
-		bone = get_nearest_bone(C, 1, event->mval[0], event->mval[1]);
+	bone = get_nearest_bone(C, event->mval, !extend);
 
 	if (!bone)
 		return OPERATOR_CANCELLED;
@@ -276,10 +275,24 @@ void ARMATURE_OT_select_linked(wmOperatorType *ot)
 	RNA_def_boolean(ot->srna, "extend", false, "Extend", "Extend selection instead of deselecting everything first");
 }
 
+/* utility function for get_nearest_editbonepoint */
+static int selectbuffer_ret_hits_12(unsigned int *UNUSED(buffer), const int hits12)
+{
+	return hits12;
+}
+
+static int selectbuffer_ret_hits_5(unsigned int *buffer, const int hits12, const int hits5)
+{
+	const int offs = 4 * hits12;
+	memcpy(buffer, buffer + offs, 4 * hits5 * sizeof(unsigned int));
+	return hits5;
+}
+
 /* does bones and points */
 /* note that BONE ROOT only gets drawn for root bones (or without IK) */
-static EditBone *get_nearest_editbonepoint(ViewContext *vc, const int mval[2],
-                                           ListBase *edbo, int findunsel, int *selmask)
+static EditBone *get_nearest_editbonepoint(
+        ViewContext *vc, const int mval[2],
+        ListBase *edbo, bool findunsel, bool use_cycle, int *r_selmask)
 {
 	bArmature *arm = (bArmature *)vc->obedit->data;
 	EditBone *ebone_next_act = arm->act_edbone;
@@ -289,7 +302,9 @@ static EditBone *get_nearest_editbonepoint(ViewContext *vc, const int mval[2],
 	unsigned int buffer[MAXPICKBUF];
 	unsigned int hitresult, besthitresult = BONESEL_NOSEL;
 	int i, mindep = 5;
-	short hits;
+	int hits12, hits5 = 0;
+
+	static int last_mval[2] = {-100, -100};
 
 	/* find the bone after the current active bone, so as to bump up its chances in selection.
 	 * this way overlapping bones will cycle selection state as with objects. */
@@ -303,22 +318,59 @@ static EditBone *get_nearest_editbonepoint(ViewContext *vc, const int mval[2],
 		ebone_next_act = NULL;
 	}
 
-	rect.xmin = mval[0] - 5;
-	rect.xmax = mval[0] + 5;
-	rect.ymin = mval[1] - 5;
-	rect.ymax = mval[1] + 5;
-
-	hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, true);
-	if (hits == 0) {
-		rect.xmin = mval[0] - 12;
-		rect.xmax = mval[0] + 12;
-		rect.ymin = mval[1] - 12;
-		rect.ymax = mval[1] + 12;
-		hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, true);
+	bool do_nearest = false;
+
+	/* define if we use solid nearest select or not */
+	if (use_cycle) {
+		if (vc->v3d->drawtype > OB_WIRE) {
+			do_nearest = true;
+			if (len_manhattan_v2v2_int(mval, last_mval) < 3) {
+				do_nearest = false;
+			}
+		}
+		copy_v2_v2_int(last_mval, mval);
+	}
+	else {
+		if (vc->v3d->drawtype > OB_WIRE) {
+			do_nearest = true;
+		}
 	}
+
+	/* matching logic from 'mixed_bones_object_selectbuffer' */
+	const int select_mode = (do_nearest ? VIEW3D_SELECT_PICK_NEAREST : VIEW3D_SELECT_PICK_ALL);
+	int hits = 0;
+
+	/* we _must_ end cache before return, use 'goto cache_end' */
+	view3d_opengl_select_cache_begin();
+
+	BLI_rcti_init_pt_radius(&rect, mval, 12);
+	hits12 = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, select_mode);
+	if (hits12 == 1) {
+		hits = selectbuffer_ret_hits_12(buffer, hits12);
+		goto cache_end;
+	}
+	else if (hits12 > 0) {
+		int offs;
+
+		offs = 4 * hits12;
+		BLI_rcti_init_pt_radius(&rect, mval, 5);
+		hits5 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, select_mode);
+
+		if (hits5 == 1) {
+			hits = selectbuffer_ret_hits_5(buffer, hits12, hits5);
+			goto cache_end;
+		}
+
+		if      (hits5 > 0) { hits = selectbuffer_ret_hits_5(buffer,  hits12, hits5); goto cache_end; }
+		else                { hits = selectbuffer_ret_hits_12(buffer, hits12); goto cache_end; }
+	}
+
+cache_end:
+	view3d_opengl_select_cache_end();
+
 	/* See if there are any selected bones in this group */
 	if (hits > 0) {
-		
+
 		if (hits == 1) {
 			if (!(buffer[3] & BONESEL_NOSEL))
 				besthitresult = buffer[3];
@@ -375,17 +427,17 @@ static EditBone *get_nearest_editbonepoint(ViewContext *vc, const int mval[2],
 			
 			ebone = BLI_findlink(edbo, besthitresult & ~BONESEL_ANY);
 			
-			*selmask = 0;
+			*r_selmask = 0;
 			if (besthitresult & BONESEL_ROOT)
-				*selmask |= BONE_ROOTSEL;
+				*r_selmask |= BONE_ROOTSEL;
 			if (besthitresult & BONESEL_TIP)
-				*selmask |= BONE_TIPSEL;
+				*r_selmask |= BONE_TIPSEL;
 			if (besthitresult & BONESEL_BONE)
-				*selmask |= BONE_SELECTED;
+				*r_selmask |= BONE_SELECTED;
 			return ebone;
 		}
 	}
-	*selmask = 0;
+	*r_selmask = 0;
 	return NULL;
 }
 
@@ -439,8 +491,8 @@ bool ED_armature_select_pick(bContext *C, const int mval[2], bool extend, bool d
 	if (BIF_sk_selectStroke(C, mval, extend)) {
 		return true;
 	}
-	
-	nearBone = get_nearest_editbonepoint(&vc, mval, arm->edbo, 1, &selmask);
+
+	nearBone = get_nearest_editbonepoint(&vc, mval, arm->edbo, true, true, &selmask);
 	if (nearBone) {
 
 		if (!extend && !deselect && !toggle) {
@@ -1202,7 +1254,7 @@ static int armature_shortest_path_pick_invoke(bContext *C, wmOperator *op, const
 	view3d_operator_needs_opengl(C);
 
 	ebone_src = arm->act_edbone;
-	ebone_dst = get_nearest_bone(C, 0, event->mval[0], event->mval[1]);
+	ebone_dst = get_nearest_bone(C, event->mval, false);
 
 	/* fallback to object selection */
 	if (ELEM(NULL, ebone_src, ebone_dst) || (ebone_src == ebone_dst)) {
diff --git a/source/blender/editors/armature/editarmature_sketch.c b/source/blender/editors/armature/editarmature_sketch.c
index f6c04e9570a..bba486bc65c 100644
--- a/source/blender/editors/armature/editarmature_sketch.c
+++ b/source/blender/editors/armature/editarmature_sketch.c
@@ -1907,12 +1907,9 @@ static bool sk_selectStroke(bContext *C, SK_Sketch *sketch, const int mval[2], c
 
 	view3d_set_viewcontext(C, &vc);
 
-	rect.xmin = mval[0] - 5;
-	rect.xmax = mval[0] + 5;
-	rect.ymin = mval[1] - 5;
-	rect.ymax = mval[1] + 5;
+	BLI_rcti_init_pt_radius(&rect, mval, 5);
 
-	hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true);
+	hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, VIEW3D_SELECT_PICK_NEAREST);
 
 	if (hits > 0) {
 		int besthitresult = -1;
diff --git a/source/blender/editors/armature/pose_select.c b/source/blender/editors/armature/pose_select.c
index 44470c1f827..6e328552411 100644
--- a/source/blender/editors/armature/pose_select.c
+++ b/source/blender/editors/armature/pose_select.c
@@ -132,8 +132,9 @@ void ED_pose_bone_select(Object *ob, bPoseChannel *pchan, bool select)
 
 /* called from editview.c, for mode-less pose selection */
 /* assumes scene obact and basact is still on old situation */
-int ED_do_pose_selectbuffer(Scene *scene, Base *base, unsigned int *buffer, short hits,
-                            bool extend, bool deselect, bool toggle, bool do_nearest)
+bool ED_do_pose_selectbuffer(
+        Scene *scene, Base *base, const unsigned int *buffer, short hits,
+        bool extend, bool deselect, bool toggle, bool do_nearest)
 {
 	Object *ob = base->object;
 	Bone *nearBone;
@@ -280,12 +281,9 @@ static int pose_select_connected_invoke(bContext *C, wmOperator *op, const wmEve
 	const bool extend = RNA_boolean_get(op->ptr, "extend");
 
 	view3d_operator_needs_opengl(C);
-	
-	if (extend)
-		bone = get_nearest_bone(C, 0, event->mval[0], event->mval[1]);
-	else
-		bone = get_nearest_bone(C, 1, event->mval[0], event->mval[1]);
-	
+
+	bone = get_nearest_bone(C, event->mval, !extend);
+
 	if (!bone)
 		return OPERATOR_CANCELLED;
 	
diff --git a/source/blender/editors/armature/pose_slide.c b/source/blender/editors/armature/pose_slide.c
index 8e8345d34c9..21cb405c32b 100644
--- a/source/blender/editors/armature/pose_slide.c
+++ b/source/blender/editors/armature/pose_slide.c
@@ -98,8 +98,11 @@ typedef struct tPoseSlideOp {
 	int prevFrame;      /* frame before current frame (blend-from) */
 	int nextFrame;      /* frame after current frame (blend-to) */
 	
-	int mode;           /* sliding mode (ePoseSlide_Modes) */
-	int flag;           /* unused for now, but can later get used for storing runtime settings.... */
+	short mode;         /* sliding mode (ePoseSlide_Modes) */
+	short flag;         /* unused for now, but can later get used for storing runtime settings.... */
+	
+	short channels;     /* which transforms/channels are affected (ePoseSlide_Channels) */
+	short axislock;     /* axis-limits for transforms (ePoseSlide_AxisLock) */
 	
 	float percentage;   /* 0-1 value for determining the influence of whatever is relevant */
 	
@@ -113,6 +116,49 @@ typedef enum ePoseSlide_Modes {
 	POSESLIDE_BREAKDOWN,        /* slide between the endpoint poses, finding a 'soft' spot */
 } ePoseSlide_Modes;
 
+
+/* Transforms/Channels to Affect */
+typedef enum ePoseSlide_Channels {
+	PS_TFM_ALL = 0,              /* All transforms and properties */
+	
+	PS_TFM_LOC,                  /* Loc/Rot/Scale */
+	PS_TFM_ROT,
+	PS_TFM_SIZE,
+	
+	PS_TFM_BBONE_SHAPE,          /* Bendy Bones */
+	
+	PS_TFM_PROPS                 /* Custom Properties */
+} ePoseSlide_Channels;
+
+/* Property enum for ePoseSlide_Channels */
+static EnumPropertyItem prop_channels_types[] = {
+	{PS_TFM_ALL, "ALL", 0, "All Properties", 
+	 "All properties, including transforms, bendy bone shape, and custom properties"},
+	{PS_TFM_LOC, "LOC", 0, "Location", "Location only"},
+	{PS_TFM_ROT, "ROT", 0, "Rotation", "Rotation only"},
+	{PS_TFM_SIZE, "SIZE", 0, "Scale", "Scale only"},
+	{PS_TFM_BBONE_SHAPE, "BBONE", 0, "Bendy Bone", "Bendy Bone shape properties"},
+	{PS_TFM_PROPS, "CUSTOM", 0, "Custom Properties", "Custom properties"},
+	{0, NULL, 0, NULL, NULL}
+};
+
+/* Axis Locks */
+typedef enum ePoseSlide_AxisLock {
+	PS_LOCK_X = (1 << 0),
+	PS_LOCK_Y = (1 << 1),
+	PS_LOCK_Z = (1 << 2)
+} ePoseSlide_AxisLock;
+
+/* Property enum for ePoseSlide_AxisLock */
+static EnumPropertyItem prop_axis_lock_types[] = {
+	{0, "FREE", 0, "Free", "All axes are affected"},
+	{PS_LOCK_X, "X", 0, "X", "Only X-axis transforms are affected"},
+	{PS_LOCK_Y, "Y", 0, "Y", "Only Y-axis transforms are affected"},
+	{PS_LOCK_Z, "Z", 0, "Z", "Only Z-axis transforms are affected"},
+	/* TODO: Combinations? */
+	{0, NULL, 0, NULL, NULL}
+};
+
 /* ------------------------------------ */
 
 /* operator init */
@@ -139,6 +185,10 @@ static int pose_slide_init(bContext *C, wmOperator *op, short mode)
 	pso->prevFrame = RNA_int_get(op->ptr, "prev_frame");
 	pso->nextFrame = RNA_int_get(op->ptr, "next_frame");
 	
+	/* get the set of properties/axes that can be operated on */
+	pso->channels = RNA_enum_get(op->ptr, "channels");
+	pso->axislock = RNA_enum_get(op->ptr, "axis_lock");
+	
 	/* check the settings from the context */
 	if (ELEM(NULL, pso->ob, pso->arm, pso->ob->adt, pso->ob->adt->action))
 		return 0;
@@ -293,10 +343,20 @@ static void pose_slide_apply_vec3(tPoseSlideOp *pso, tPChanFCurveLink *pfl, floa
 	/* using this path, find each matching F-Curve for the variables we're interested in */
 	while ( (ld = poseAnim_mapping_getNextFCurve(&pfl->fcurves, ld, path)) ) {
 		FCurve *fcu = (FCurve *)ld->data;
-
-		/* just work on these channels one by one... there's no interaction between values */
+		const int idx  = fcu->array_index;
+		const int lock = pso->axislock;
+		
+		/* check if this F-Curve is ok given the current axis locks */
 		BLI_assert(fcu->array_index < 3);
-		pose_slide_apply_val(pso, fcu, &vec[fcu->array_index]);
+		
+		if ((lock == 0) ||
+		    ((lock & PS_LOCK_X) && (idx == 0)) ||
+		    ((lock & PS_LOCK_Y) && (idx == 1)) ||
+		    ((lock & PS_LOCK_Z) && (idx == 2)))
+		{
+			/* just work on these channels one by one... there's no interaction between values */
+			pose_slide_apply_val(pso, fcu, &vec[fcu->array_index]);
+		}
 	}
 	
 	/* free the temp path we got */
@@ -494,17 +554,17 @@ static void pose_slide_apply(bContext *C, tPoseSlideOp *pso)
 		 */
 		bPoseChannel *pchan = pfl->pchan;
 		 
-		if (pchan->flag & POSE_LOC) {
+		if (ELEM(pso->channels, PS_TFM_ALL, PS_TFM_LOC) && (pchan->flag & POSE_LOC)) {
 			/* calculate these for the 'location' vector, and use location curves */
 			pose_slide_apply_vec3(pso, pfl, pchan->loc, "location");
 		}
 		
-		if (pchan->flag & POSE_SIZE) {
+		if (ELEM(pso->channels, PS_TFM_ALL, PS_TFM_SIZE) && (pchan->flag & POSE_SIZE)) {
 			/* calculate these for the 'scale' vector, and use scale curves */
 			pose_slide_apply_vec3(pso, pfl, pchan->size, "scale");
 		}
 		
-		if (pchan->flag & POSE_ROT) {
+		if (ELEM(pso->channels, PS_TFM_ALL, PS_TFM_ROT) && (pchan->flag & POSE_ROT)) {
 			/* everything depends on the rotation mode */
 			if (pchan->rotmode > 0) {
 				/* eulers - so calculate these for the 'eul' vector, and use euler_rotation curves */
@@ -519,12 +579,12 @@ static void pose_slide_apply(bContext *C, tPoseSlideOp *pso)
 			}
 		}
 		
-		if (pchan->flag & POSE_BBONE_SHAPE) {
+		if (ELEM(pso->channels, PS_TFM_ALL, PS_TFM_BBONE_SHAPE) && (pchan->flag & POSE_BBONE_SHAPE)) {
 			/* bbone properties - they all start a "bbone_" prefix */
 			pose_slide_apply_props(pso, pfl, "bbone_"); 
 		}
 		
-		if (pfl->oldprops) {
+		if (ELEM(pso->channels, PS_TFM_ALL, PS_TFM_PROPS) && (pfl->oldprops)) {
 			/* not strictly a transform, but custom properties contribute to the pose produced in many rigs
 			 * (e.g. the facial rigs used in Sintel)
 			 */
@@ -553,9 +613,12 @@ static void pose_slide_reset(tPoseSlideOp *pso)
 /* ------------------------------------ */
 
 /* draw percentage indicator in header */
+// TODO: Include hints about locks here...
 static void pose_slide_draw_status(tPoseSlideOp *pso)
 {
 	char status_str[UI_MAX_DRAW_STR];
+	char limits_str[UI_MAX_DRAW_STR];
+	char axis_str[50];
 	char mode_str[32];
 	
 	switch (pso->mode) {
@@ -575,16 +638,58 @@ static void pose_slide_draw_status(tPoseSlideOp *pso)
 			break;
 	}
 	
+	switch (pso->axislock) {
+		case PS_LOCK_X:
+			BLI_strncpy(axis_str, "[X]/Y/Z axis only (X to clear)", sizeof(axis_str));
+			break;
+		case PS_LOCK_Y:
+			BLI_strncpy(axis_str, "X/[Y]/Z axis only (Y to clear)", sizeof(axis_str));
+			break;
+		case PS_LOCK_Z:
+			BLI_strncpy(axis_str, "X/Y/[Z] axis only (Z to clear)", sizeof(axis_str));
+			break;
+			
+		default:
+			if (ELEM(pso->channels, PS_TFM_LOC, PS_TFM_ROT, PS_TFM_SIZE)) {
+				BLI_strncpy(axis_str, "X/Y/Z = Axis Constraint", sizeof(axis_str));
+			}
+			else {
+				axis_str[0] = '\0';
+			}
+			break;
+	}
+	
+	switch (pso->channels) { /* build the hint for the active channel limit; axis hint appended for Loc/Rot/Scale only */
+		case PS_TFM_LOC:
+			BLI_snprintf(limits_str, sizeof(limits_str), "[G]/R/S/B/C - Location only (G to clear) | %s", axis_str);
+			break;
+		case PS_TFM_ROT:
+			BLI_snprintf(limits_str, sizeof(limits_str), "G/[R]/S/B/C - Rotation only (R to clear) | %s", axis_str);
+			break;
+		case PS_TFM_SIZE:
+			BLI_snprintf(limits_str, sizeof(limits_str), "G/R/[S]/B/C - Scale only (S to clear) | %s", axis_str);
+			break;
+		case PS_TFM_BBONE_SHAPE: /* plain copy, no format specifier: axis locks are not available for this channel set */
+			BLI_strncpy(limits_str, "G/R/S/[B]/C - Bendy Bone properties only (B to clear)", sizeof(limits_str));
+			break;
+		case PS_TFM_PROPS: /* plain copy, no format specifier: axis locks are not available for this channel set */
+			BLI_strncpy(limits_str, "G/R/S/B/[C] - Custom Properties only (C to clear)", sizeof(limits_str));
+			break;
+		default: /* PS_TFM_ALL: no channel limit active, just list the available hotkeys */
+			BLI_strncpy(limits_str, "G/R/S/B/C - Limit to Transform/Property Set", sizeof(limits_str));
+			break;
+	}
+	
 	if (hasNumInput(&pso->num)) {
 		Scene *scene = pso->scene;
 		char str_offs[NUM_STR_REP_LEN];
 		
 		outputNumInput(&pso->num, str_offs, &scene->unit);
 		
-		BLI_snprintf(status_str, sizeof(status_str), "%s: %s", mode_str, str_offs);
+		BLI_snprintf(status_str, sizeof(status_str), "%s: %s     |   %s", mode_str, str_offs, limits_str);
 	}
 	else {
-		BLI_snprintf(status_str, sizeof(status_str), "%s: %d %%", mode_str, (int)(pso->percentage * 100.0f));
+		BLI_snprintf(status_str, sizeof(status_str), "%s: %d %%     |   %s", mode_str, (int)(pso->percentage * 100.0f), limits_str);
 	}
 	
 	ED_area_headerprint(pso->sa, status_str);
@@ -675,11 +780,58 @@ static void pose_slide_mouse_update_percentage(tPoseSlideOp *pso, wmOperator *op
 	RNA_float_set(op->ptr, "percentage", pso->percentage);
 }
 
+/* handle an event to toggle channels mode */
+static void pose_slide_toggle_channels_mode(wmOperator *op, tPoseSlideOp *pso, ePoseSlide_Channels channel)
+{
+	/* Turn channel on or off? */
+	if (pso->channels == channel) {
+		/* Already limiting to transform only, so pressing this again turns it off */
+		pso->channels = PS_TFM_ALL;
+	}
+	else {
+		/* Only this set of channels */
+		pso->channels = channel;
+	}
+	RNA_enum_set(op->ptr, "channels", pso->channels);
+	
+	
+	/* Reset axis limits too for good measure */
+	pso->axislock = 0;
+	RNA_enum_set(op->ptr, "axis_lock", pso->axislock);
+}
+
+/* handle an event to toggle axis locks - returns whether any change in state is needed */
+static bool pose_slide_toggle_axis_locks(wmOperator *op, tPoseSlideOp *pso, ePoseSlide_AxisLock axis)
+{
+	/* Axis can only be set when a transform is set - it doesn't make sense otherwise */
+	if (ELEM(pso->channels, PS_TFM_ALL, PS_TFM_BBONE_SHAPE, PS_TFM_PROPS)) {
+		pso->axislock = 0;
+		RNA_enum_set(op->ptr, "axis_lock", pso->axislock);
+		return false;
+	}
+	
+	/* Turn on or off? */
+	if (pso->axislock == axis) {
+		/* Already limiting on this axis, so turn off */
+		pso->axislock = 0;
+	}
+	else {
+		/* Only this axis */
+		pso->axislock = axis;
+	}
+	RNA_enum_set(op->ptr, "axis_lock", pso->axislock);
+	
+	/* Setting changed, so pose update is needed */
+	return true;
+}
+
 /* common code for modal() */
 static int pose_slide_modal(bContext *C, wmOperator *op, const wmEvent *event)
 {
 	tPoseSlideOp *pso = op->customdata;
 	wmWindow *win = CTX_wm_window(C);
+	bool do_pose_update = false;
+	
 	const bool has_numinput = hasNumInput(&pso->num);
 	
 	switch (event->type) {
@@ -718,7 +870,8 @@ static int pose_slide_modal(bContext *C, wmOperator *op, const wmEvent *event)
 			/* canceled! */
 			return OPERATOR_CANCELLED;
 		}
-			
+		
+		/* Percentage Change... */
 		case MOUSEMOVE: /* calculate new position */
 		{
 			/* only handle mousemove if not doing numinput */
@@ -726,14 +879,8 @@ static int pose_slide_modal(bContext *C, wmOperator *op, const wmEvent *event)
 				/* update percentage based on position of mouse */
 				pose_slide_mouse_update_percentage(pso, op, event);
 				
-				/* update percentage indicator in header */
-				pose_slide_draw_status(pso);
-				
-				/* reset transforms (to avoid accumulation errors) */
-				pose_slide_reset(pso);
-				
-				/* apply... */
-				pose_slide_apply(C, pso);
+				/* update pose to reflect the new values (see below) */
+				do_pose_update = true;
 			}
 			break;
 		}
@@ -751,16 +898,75 @@ static int pose_slide_modal(bContext *C, wmOperator *op, const wmEvent *event)
 				CLAMP(pso->percentage, 0.0f, 1.0f);
 				RNA_float_set(op->ptr, "percentage", pso->percentage);
 				
-				/* update percentage indicator in header */
-				pose_slide_draw_status(pso);
-				
-				/* reset transforms (to avoid accumulation errors) */
-				pose_slide_reset(pso);
-				
-				/* apply... */
-				pose_slide_apply(C, pso);
+				/* Update pose to reflect the new values (see below) */
+				do_pose_update = true;
 				break;
 			}
+			else if (event->val == KM_PRESS) {
+				switch (event->type) {
+					/* Transform Channel Limits  */
+					/* XXX: Replace these hardcoded hotkeys with a modalmap that can be customised */
+					case GKEY: /* Location */
+					{
+						pose_slide_toggle_channels_mode(op, pso, PS_TFM_LOC);
+						do_pose_update = true;
+						break;
+					}
+					case RKEY: /* Rotation */
+					{
+						pose_slide_toggle_channels_mode(op, pso, PS_TFM_ROT);
+						do_pose_update = true;
+						break;
+					}
+					case SKEY: /* Scale */
+					{
+						pose_slide_toggle_channels_mode(op, pso, PS_TFM_SIZE);
+						do_pose_update = true;
+						break;
+					}
+					case BKEY: /* Bendy Bones */
+					{
+						pose_slide_toggle_channels_mode(op, pso, PS_TFM_BBONE_SHAPE);
+						do_pose_update = true;
+						break;
+					}
+					case CKEY: /* Custom Properties */
+					{
+						pose_slide_toggle_channels_mode(op, pso, PS_TFM_PROPS);
+						do_pose_update = true;
+						break;
+					}
+					
+					
+					/* Axis Locks */
+					/* XXX: Hardcoded... */
+					case XKEY:
+					{
+						if (pose_slide_toggle_axis_locks(op, pso, PS_LOCK_X)) {
+							do_pose_update = true;
+						}
+						break;
+					}
+					case YKEY:
+					{
+						if (pose_slide_toggle_axis_locks(op, pso, PS_LOCK_Y)) {
+							do_pose_update = true;
+						}
+						break;
+					}
+					case ZKEY:
+					{
+						if (pose_slide_toggle_axis_locks(op, pso, PS_LOCK_Z)) {
+							do_pose_update = true;
+						}
+						break;
+					}
+					
+					
+					default: /* Some other unhandled key... */
+						break;
+				}
+			}
 			else {
 				/* unhandled event - maybe it was some view manip? */
 				/* allow to pass through */
@@ -768,6 +974,19 @@ static int pose_slide_modal(bContext *C, wmOperator *op, const wmEvent *event)
 			}
 	}
 	
+	
+	/* perform pose updates - in response to some user action (e.g. pressing a key or moving the mouse) */
+	if (do_pose_update) {
+		/* update percentage indicator in header */
+		pose_slide_draw_status(pso);
+		
+		/* reset transforms (to avoid accumulation errors) */
+		pose_slide_reset(pso);
+		
+		/* apply... */
+		pose_slide_apply(C, pso);
+	}
+	
 	/* still running... */
 	return OPERATOR_RUNNING_MODAL;
 }
@@ -795,11 +1014,16 @@ static int pose_slide_exec_common(bContext *C, wmOperator *op, tPoseSlideOp *pso
 }
 
 /* common code for defining RNA properties */
+/* TODO: Skip save on these? */
 static void pose_slide_opdef_properties(wmOperatorType *ot)
 {
+	RNA_def_float_percentage(ot->srna, "percentage", 0.5f, 0.0f, 1.0f, "Percentage", "Weighting factor for which keyframe is favored more", 0.3, 0.7);
+	
 	RNA_def_int(ot->srna, "prev_frame", 0, MINAFRAME, MAXFRAME, "Previous Keyframe", "Frame number of keyframe immediately before the current frame", 0, 50);
 	RNA_def_int(ot->srna, "next_frame", 0, MINAFRAME, MAXFRAME, "Next Keyframe", "Frame number of keyframe immediately after the current frame", 0, 50);
-	RNA_def_float_percentage(ot->srna, "percentage", 0.5f, 0.0f, 1.0f, "Percentage", "Weighting factor for the sliding operation", 0.3, 0.7);
+	
+	RNA_def_enum(ot->srna, "channels", prop_channels_types, PS_TFM_ALL, "Channels", "Set of properties that are affected");
+	RNA_def_enum(ot->srna, "axis_lock", prop_axis_lock_types, 0, "Axis Lock", "Transform axis to restrict effects to");
 }
 
 /* ------------------------------------ */
diff --git a/source/blender/editors/curve/editcurve.c b/source/blender/editors/curve/editcurve.c
index e9fd5fb5a43..47f42ab5321 100644
--- a/source/blender/editors/curve/editcurve.c
+++ b/source/blender/editors/curve/editcurve.c
@@ -91,14 +91,6 @@ typedef struct {
 	int flag;
 } UndoCurve;
 
-/* Definitions needed for shape keys */
-typedef struct {
-	void *orig_cv;
-	int key_index, nu_index, pt_index, vertex_index;
-	bool switched;
-	Nurb *orig_nu;
-} CVKeyIndex;
-
 void selectend_nurb(Object *obedit, enum eEndPoint_Types selfirst, bool doswap, bool selstatus);
 static void adduplicateflagNurb(Object *obedit, ListBase *newnurb, const short flag, const bool split);
 static int curve_delete_segments(Object *obedit, const bool split);
@@ -138,9 +130,9 @@ void printknots(Object *obedit)
 
 /* ********************* Shape keys *************** */
 
-static CVKeyIndex *init_cvKeyIndex(void *cv, int key_index, int nu_index, int pt_index, int vertex_index, Nurb *orig_nu)
+static CVKeyIndex *init_cvKeyIndex(void *cv, int key_index, int nu_index, int pt_index, int vertex_index)
 {
-	CVKeyIndex *cvIndex = MEM_callocN(sizeof(CVKeyIndex), "init_cvKeyIndex");
+	CVKeyIndex *cvIndex = MEM_callocN(sizeof(CVKeyIndex), __func__);
 
 	cvIndex->orig_cv = cv;
 	cvIndex->key_index = key_index;
@@ -148,7 +140,6 @@ static CVKeyIndex *init_cvKeyIndex(void *cv, int key_index, int nu_index, int pt
 	cvIndex->pt_index = pt_index;
 	cvIndex->vertex_index = vertex_index;
 	cvIndex->switched = false;
-	cvIndex->orig_nu = orig_nu;
 
 	return cvIndex;
 }
@@ -174,7 +165,12 @@ static void init_editNurb_keyIndex(EditNurb *editnurb, ListBase *origBase)
 			origbezt = orignu->bezt;
 			pt_index = 0;
 			while (a--) {
-				keyIndex = init_cvKeyIndex(origbezt, key_index, nu_index, pt_index, vertex_index, orignu);
+				/* We cannot keep *any* reference to curve obdata:
+				 * it might be replaced and freed while editcurve remains in use (e.g. in the viewport render case).
+				 * Note that we could use a pool to avoid lots of mallocs here, but that is not really a problem for now. */
+				BezTriple *origbezt_cpy = MEM_mallocN(sizeof(*origbezt), __func__);
+				*origbezt_cpy = *origbezt;
+				keyIndex = init_cvKeyIndex(origbezt_cpy, key_index, nu_index, pt_index, vertex_index);
 				BLI_ghash_insert(gh, bezt, keyIndex);
 				key_index += 12;
 				vertex_index += 3;
@@ -189,7 +185,12 @@ static void init_editNurb_keyIndex(EditNurb *editnurb, ListBase *origBase)
 			origbp = orignu->bp;
 			pt_index = 0;
 			while (a--) {
-				keyIndex = init_cvKeyIndex(origbp, key_index, nu_index, pt_index, vertex_index, orignu);
+				/* We cannot keep *any* reference to curve obdata:
+				 * it might be replaced and freed while editcurve remains in use (e.g. in the viewport render case).
+				 * Note that we could use a pool to avoid lots of mallocs here, but that is not really a problem for now. */
+				BPoint *origbp_cpy = MEM_mallocN(sizeof(*origbp_cpy), __func__);
+				*origbp_cpy = *origbp;
+				keyIndex = init_cvKeyIndex(origbp_cpy, key_index, nu_index, pt_index, vertex_index);
 				BLI_ghash_insert(gh, bp, keyIndex);
 				key_index += 4;
 				bp++;
@@ -250,23 +251,22 @@ static int getKeyIndexOrig_keyIndex(EditNurb *editnurb, void *cv)
 	return index->key_index;
 }
 
-static void keyIndex_delCV(EditNurb *editnurb, const void *cv)
+static void keyIndex_delBezt(EditNurb *editnurb, BezTriple *bezt)
 {
 	if (!editnurb->keyindex) {
 		return;
 	}
 
-	BLI_ghash_remove(editnurb->keyindex, cv, NULL, MEM_freeN);
-}
-
-static void keyIndex_delBezt(EditNurb *editnurb, BezTriple *bezt)
-{
-	keyIndex_delCV(editnurb, bezt);
+	BKE_curve_editNurb_keyIndex_delCV(editnurb->keyindex, bezt);
 }
 
 static void keyIndex_delBP(EditNurb *editnurb, BPoint *bp)
 {
-	keyIndex_delCV(editnurb, bp);
+	if (!editnurb->keyindex) {
+		return;
+	}
+
+	BKE_curve_editNurb_keyIndex_delCV(editnurb->keyindex, bp);
 }
 
 static void keyIndex_delNurb(EditNurb *editnurb, Nurb *nu)
@@ -282,7 +282,7 @@ static void keyIndex_delNurb(EditNurb *editnurb, Nurb *nu)
 		a = nu->pntsu;
 
 		while (a--) {
-			BLI_ghash_remove(editnurb->keyindex, bezt, NULL, MEM_freeN);
+			BKE_curve_editNurb_keyIndex_delCV(editnurb->keyindex, bezt);
 			bezt++;
 		}
 	}
@@ -291,7 +291,7 @@ static void keyIndex_delNurb(EditNurb *editnurb, Nurb *nu)
 		a = nu->pntsu * nu->pntsv;
 
 		while (a--) {
-			BLI_ghash_remove(editnurb->keyindex, bp, NULL, MEM_freeN);
+			BKE_curve_editNurb_keyIndex_delCV(editnurb->keyindex, bp);
 			bp++;
 		}
 	}
@@ -535,6 +535,7 @@ static GHash *dupli_keyIndexHash(GHash *keyindex)
 		CVKeyIndex *newIndex = MEM_mallocN(sizeof(CVKeyIndex), "dupli_keyIndexHash index");
 
 		memcpy(newIndex, index, sizeof(CVKeyIndex));
+		newIndex->orig_cv = MEM_dupallocN(index->orig_cv);
 
 		BLI_ghash_insert(gh, cv, newIndex);
 	}
@@ -624,7 +625,7 @@ static void calc_keyHandles(ListBase *nurb, float *key)
 	}
 }
 
-static void calc_shapeKeys(Object *obedit)
+static void calc_shapeKeys(Object *obedit, ListBase *newnurbs)
 {
 	Curve *cu = (Curve *)obedit->data;
 
@@ -636,7 +637,7 @@ static void calc_shapeKeys(Object *obedit)
 		KeyBlock *actkey = BLI_findlink(&cu->key->block, editnurb->shapenr - 1);
 		BezTriple *bezt, *oldbezt;
 		BPoint *bp, *oldbp;
-		Nurb *nu;
+		Nurb *nu, *newnu;
 		int totvert = BKE_nurbList_verts_count(&editnurb->nurbs);
 
 		float (*ofs)[3] = NULL;
@@ -706,20 +707,25 @@ static void calc_shapeKeys(Object *obedit)
 
 		currkey = cu->key->block.first;
 		while (currkey) {
-			int apply_offset = (ofs && (currkey != actkey) && (editnurb->shapenr - 1 == currkey->relative));
+			const bool apply_offset = (ofs && (currkey != actkey) && (editnurb->shapenr - 1 == currkey->relative));
 
 			float *fp = newkey = MEM_callocN(cu->key->elemsize * totvert,  "currkey->data");
 			ofp = oldkey = currkey->data;
 
 			nu = editnurb->nurbs.first;
+			/* We need to restore the original curve into newnurbs, *not* into editcurve's nurbs.
+			 * Otherwise, in case we update obdata *without* leaving editmode (e.g. viewport render), we would
+			 * invalidate editcurve. */
+			newnu = newnurbs->first;
 			i = 0;
 			while (nu) {
 				if (currkey == actkey) {
-					int restore = actkey != cu->key->refkey;
+					const bool restore = actkey != cu->key->refkey;
 
 					if (nu->bezt) {
 						bezt = nu->bezt;
 						a = nu->pntsu;
+						BezTriple *newbezt = newnu->bezt;
 						while (a--) {
 							int j;
 							oldbezt = getKeyIndexOrig_bezt(editnurb, bezt);
@@ -728,7 +734,7 @@ static void calc_shapeKeys(Object *obedit)
 								copy_v3_v3(fp, bezt->vec[j]);
 
 								if (restore && oldbezt) {
-									copy_v3_v3(bezt->vec[j], oldbezt->vec[j]);
+									copy_v3_v3(newbezt->vec[j], oldbezt->vec[j]);
 								}
 
 								fp += 3;
@@ -736,16 +742,18 @@ static void calc_shapeKeys(Object *obedit)
 							fp[0] = bezt->alfa;
 
 							if (restore && oldbezt) {
-								bezt->alfa = oldbezt->alfa;
+								newbezt->alfa = oldbezt->alfa;
 							}
 
 							fp += 3; ++i; /* alphas */
 							bezt++;
+							newbezt++;
 						}
 					}
 					else {
 						bp = nu->bp;
 						a = nu->pntsu * nu->pntsv;
+						BPoint *newbp = newnu->bp;
 						while (a--) {
 							oldbp = getKeyIndexOrig_bp(editnurb, bp);
 
@@ -754,12 +762,13 @@ static void calc_shapeKeys(Object *obedit)
 							fp[3] = bp->alfa;
 
 							if (restore && oldbp) {
-								copy_v3_v3(bp->vec, oldbp->vec);
-								bp->alfa = oldbp->alfa;
+								copy_v3_v3(newbp->vec, oldbp->vec);
+								newbp->alfa = oldbp->alfa;
 							}
 
 							fp += 4;
 							bp++;
+							newbp++;
 							i += 2;
 						}
 					}
@@ -1204,9 +1213,13 @@ void ED_curve_editnurb_load(Object *obedit)
 			}
 		}
 
+		/* We also have to pass the newly copied nurbs, since we want to restore the original curve (without the
+		 * edited shapekey) on obdata, but *not* on editcurve itself (an ED_curve_editnurb_load call does not always
+		 * imply freeing of editcurve, e.g. when called to generate render data...). */
+		calc_shapeKeys(obedit, &newnurb);
+
 		cu->nurb = newnurb;
 
-		calc_shapeKeys(obedit);
 		ED_curve_updateAnimPaths(obedit->data);
 
 		BKE_nurbList_free(&oldnurb);
@@ -1227,13 +1240,11 @@ void ED_curve_editnurb_make(Object *obedit)
 		if (actkey) {
 			// XXX strcpy(G.editModeTitleExtra, "(Key) ");
 			undo_editmode_clear();
-			BKE_keyblock_convert_to_curve(actkey, cu, &cu->nurb);
 		}
 
 		if (editnurb) {
 			BKE_nurbList_free(&editnurb->nurbs);
-			BKE_curve_editNurb_keyIndex_free(editnurb);
-			editnurb->keyindex = NULL;
+			BKE_curve_editNurb_keyIndex_free(&editnurb->keyindex);
 		}
 		else {
 			editnurb = MEM_callocN(sizeof(EditNurb), "editnurb");
@@ -1248,12 +1259,16 @@ void ED_curve_editnurb_make(Object *obedit)
 			nu = nu->next;
 		}
 
-		if (actkey)
-			editnurb->shapenr = obedit->shapenr;
-
 		/* animation could be added in editmode even if there was no animdata in
 		 * object mode hence we always need CVs index be created */
 		init_editNurb_keyIndex(editnurb, &cu->nurb);
+
+		if (actkey) {
+			editnurb->shapenr = obedit->shapenr;
+			/* Apply shapekey to new nurbs of editnurb, not those of original curve (and *after* we generated keyIndex),
+			 * else we do not have valid 'original' data to properly restore curve when leaving editmode. */
+			BKE_keyblock_convert_to_curve(actkey, cu, &editnurb->nurbs);
+		}
 	}
 }
 
@@ -1309,8 +1324,7 @@ static int separate_exec(bContext *C, wmOperator *op)
 	ED_curve_editnurb_make(newob);
 	newedit = newcu->editnurb;
 	BKE_nurbList_free(&newedit->nurbs);
-	BKE_curve_editNurb_keyIndex_free(newedit);
-	newedit->keyindex = NULL;
+	BKE_curve_editNurb_keyIndex_free(&newedit->keyindex);
 	BLI_movelisttolist(&newedit->nurbs, &newnurb);
 
 	/* 4. put old object out of editmode and delete separated geometry */
@@ -6110,7 +6124,7 @@ static void undoCurve_to_editCurve(void *ucu, void *UNUSED(edata), void *cu_v)
 	BKE_nurbList_free(editbase);
 
 	if (undoCurve->undoIndex) {
-		BLI_ghash_free(editnurb->keyindex, NULL, MEM_freeN);
+		BKE_curve_editNurb_keyIndex_free(&editnurb->keyindex);
 		editnurb->keyindex = dupli_keyIndexHash(undoCurve->undoIndex);
 	}
 
@@ -6188,8 +6202,7 @@ static void free_undoCurve(void *ucv)
 
 	BKE_nurbList_free(&undoCurve->nubase);
 
-	if (undoCurve->undoIndex)
-		BLI_ghash_free(undoCurve->undoIndex, NULL, MEM_freeN);
+	BKE_curve_editNurb_keyIndex_free(&undoCurve->undoIndex);
 
 	free_fcurves(&undoCurve->fcurves);
 	free_fcurves(&undoCurve->drivers);
diff --git a/source/blender/editors/gpencil/gpencil_edit.c b/source/blender/editors/gpencil/gpencil_edit.c
index 5b011b679a6..601a86b97cb 100644
--- a/source/blender/editors/gpencil/gpencil_edit.c
+++ b/source/blender/editors/gpencil/gpencil_edit.c
@@ -74,7 +74,6 @@
 #include "ED_object.h"
 #include "ED_screen.h"
 #include "ED_view3d.h"
-#include "ED_screen.h"
 #include "ED_space_api.h"
 
 #include "gpencil_intern.h"
@@ -953,6 +952,9 @@ static int gp_dissolve_selected_points(bContext *C)
 			/* skip strokes that are invalid for current view */
 			if (ED_gpencil_stroke_can_use(C, gps) == false)
 				continue;
+			/* check if the color is editable */
+			if (ED_gpencil_stroke_color_use(gpl, gps) == false)
+				continue;
 			
 			if (gps->flag & GP_STROKE_SELECT) {
 				bGPDspoint *pt;
@@ -1165,6 +1167,9 @@ static int gp_delete_selected_points(bContext *C)
 			/* skip strokes that are invalid for current view */
 			if (ED_gpencil_stroke_can_use(C, gps) == false)
 				continue;
+			/* check if the color is editable */
+			if (ED_gpencil_stroke_color_use(gpl, gps) == false)
+				continue;
 			
 			
 			if (gps->flag & GP_STROKE_SELECT) {
@@ -1204,7 +1209,7 @@ static int gp_delete_exec(bContext *C, wmOperator *op)
 		case GP_DELETEOP_POINTS:	/* selected points (breaks the stroke into segments) */
 			result = gp_delete_selected_points(C);
 			break;
-
+		
 		case GP_DELETEOP_FRAME:		/* active frame */
 			result = gp_actframe_delete_exec(C, op);
 			break;
@@ -2121,10 +2126,10 @@ static int gp_count_subdivision_cuts(bGPDstroke *gps)
 	int totnewpoints = 0;
 	for (i = 0, pt = gps->points; i < gps->totpoints && pt; i++, pt++) {
 		if (pt->flag & GP_SPOINT_SELECT) {
-			if (i + 1 < gps->totpoints){
+			if (i + 1 < gps->totpoints) {
 				if (gps->points[i + 1].flag & GP_SPOINT_SELECT) {
 					++totnewpoints;
-				};
+				}
 			}
 		}
 	}
@@ -2179,7 +2184,7 @@ static int gp_stroke_subdivide_exec(bContext *C, wmOperator *op)
 
 					/* if next point is selected add a half way point */
 					if (pt->flag & GP_SPOINT_SELECT) {
-						if (i + 1 < oldtotpoints){
+						if (i + 1 < oldtotpoints) {
 							if (temp_points[i + 1].flag & GP_SPOINT_SELECT) {
 								pt_final = &gps->points[i2];
 								/* Interpolate all values */
@@ -2191,7 +2196,7 @@ static int gp_stroke_subdivide_exec(bContext *C, wmOperator *op)
 								pt_final->time = interpf(pt->time, next->time, 0.5f);
 								pt_final->flag |= GP_SPOINT_SELECT;
 								++i2;
-							};
+							}
 						}
 					}
 				}
diff --git a/source/blender/editors/gpencil/gpencil_interpolate.c b/source/blender/editors/gpencil/gpencil_interpolate.c
index 297058168a0..d2360fea672 100644
--- a/source/blender/editors/gpencil/gpencil_interpolate.c
+++ b/source/blender/editors/gpencil/gpencil_interpolate.c
@@ -86,13 +86,14 @@
 /* Core/Shared Utilities */
 
 /* Poll callback for interpolation operators */
-static int gpencil_interpolate_poll(bContext *C)
+static int gpencil_view3d_poll(bContext *C)
 {
 	bGPdata *gpd = CTX_data_gpencil_data(C);
 	bGPDlayer *gpl = CTX_data_active_gpencil_layer(C);
 	
 	/* only 3D view */
-	if (CTX_wm_area(C)->spacetype != SPACE_VIEW3D) {
+	ScrArea *sa = CTX_wm_area(C);
+	if (sa && sa->spacetype != SPACE_VIEW3D) {
 		return 0;
 	}
 	
@@ -673,7 +674,7 @@ void GPENCIL_OT_interpolate(wmOperatorType *ot)
 	ot->invoke = gpencil_interpolate_invoke;
 	ot->modal = gpencil_interpolate_modal;
 	ot->cancel = gpencil_interpolate_cancel;
-	ot->poll = gpencil_interpolate_poll;
+	ot->poll = gpencil_view3d_poll;
 	
 	/* flags */
 	ot->flag = OPTYPE_UNDO | OPTYPE_BLOCKING;
@@ -1017,7 +1018,7 @@ void GPENCIL_OT_interpolate_sequence(wmOperatorType *ot)
 	
 	/* api callbacks */
 	ot->exec = gpencil_interpolate_seq_exec;
-	ot->poll = gpencil_interpolate_poll;
+	ot->poll = gpencil_view3d_poll;
 	
 	/* flags */
 	ot->flag = OPTYPE_REGISTER | OPTYPE_UNDO;
@@ -1025,24 +1026,14 @@ void GPENCIL_OT_interpolate_sequence(wmOperatorType *ot)
 
 /* ******************** Remove Breakdowns ************************ */
 
-/* Same as gpencil_interpolate_poll(), 
- * except we ALSO need to have an active frame that is a breakdown
- */
 static int gpencil_interpolate_reverse_poll(bContext *C)
 {
-	bGPdata *gpd = CTX_data_gpencil_data(C);
-	bGPDlayer *gpl = CTX_data_active_gpencil_layer(C);
-	
-	/* only 3D view */
-	if (CTX_wm_area(C)->spacetype != SPACE_VIEW3D) {
-		return 0;
-	}
-	
-	/* need data to interpolate */
-	if (ELEM(NULL, gpd, gpl)) {
+	if (!gpencil_view3d_poll(C)) {
 		return 0;
 	}
-	
+
+	bGPDlayer *gpl = CTX_data_active_gpencil_layer(C);
+
 	/* need to be on a breakdown frame */
 	if ((gpl->actframe == NULL) || (gpl->actframe->key_type != BEZT_KEYTYPE_BREAKDOWN)) {
 		CTX_wm_operator_poll_msg_set(C, "Expected current frame to be a breakdown");
diff --git a/source/blender/editors/include/ED_armature.h b/source/blender/editors/include/ED_armature.h
index 7ad61671b1b..6b8943421bd 100644
--- a/source/blender/editors/include/ED_armature.h
+++ b/source/blender/editors/include/ED_armature.h
@@ -130,8 +130,9 @@ void ED_armature_ebone_listbase_temp_clear(struct ListBase *lb);
 void ED_armature_deselect_all(struct Object *obedit);
 void ED_armature_deselect_all_visible(struct Object *obedit);
 
-int ED_do_pose_selectbuffer(struct Scene *scene, struct Base *base, unsigned int *buffer,
-                            short hits, bool extend, bool deselect, bool toggle, bool do_nearest);
+bool ED_do_pose_selectbuffer(
+        struct Scene *scene, struct Base *base, const unsigned int *buffer, short hits,
+        bool extend, bool deselect, bool toggle, bool do_nearest);
 bool ED_armature_select_pick(struct bContext *C, const int mval[2], bool extend, bool deselect, bool toggle);
 int join_armature_exec(struct bContext *C, struct wmOperator *op);
 struct Bone *get_indexed_bone(struct Object *ob, int index);
diff --git a/source/blender/editors/include/ED_view3d.h b/source/blender/editors/include/ED_view3d.h
index 79176d9e9cf..9ce71258573 100644
--- a/source/blender/editors/include/ED_view3d.h
+++ b/source/blender/editors/include/ED_view3d.h
@@ -47,6 +47,7 @@ struct Main;
 struct MetaElem;
 struct Nurb;
 struct Object;
+struct RV3DMatrixStore;
 struct RegionView3D;
 struct Scene;
 struct ScrArea;
@@ -301,7 +302,22 @@ bool ED_view3d_autodist_depth_seg(struct ARegion *ar, const int mval_sta[2], con
 /* select */
 #define MAXPICKELEMS    2500
 #define MAXPICKBUF      (4 * MAXPICKELEMS)
-short view3d_opengl_select(struct ViewContext *vc, unsigned int *buffer, unsigned int bufsize, const struct rcti *input, bool do_nearest);
+
+typedef enum {
+	/* all elements in the region, ignore depth */
+	VIEW3D_SELECT_ALL = 0,
+	/* pick also depth sorts (only for small regions!) */
+	VIEW3D_SELECT_PICK_ALL = 1,
+	/* sorts and only returns visible objects (only for small regions!) */
+	VIEW3D_SELECT_PICK_NEAREST = 2,
+} eV3DSelectMode;
+
+void view3d_opengl_select_cache_begin(void);
+void view3d_opengl_select_cache_end(void);
+
+int view3d_opengl_select(
+        struct ViewContext *vc, unsigned int *buffer, unsigned int bufsize, const struct rcti *input,
+        eV3DSelectMode select_mode);
 
 /* view3d_select.c */
 float ED_view3d_select_dist_px(void);
@@ -330,8 +346,8 @@ void ED_view3d_check_mats_rv3d(struct RegionView3D *rv3d);
 #endif
 int ED_view3d_scene_layer_set(int lay, const int *values, int *active);
 
-void *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d);
-void  ED_view3d_mats_rv3d_restore(struct RegionView3D *rv3d, void *rv3dmat_pt);
+struct RV3DMatrixStore *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d);
+void                    ED_view3d_mats_rv3d_restore(struct RegionView3D *rv3d, struct RV3DMatrixStore *rv3dmat);
 
 bool ED_view3d_context_activate(struct bContext *C);
 void ED_view3d_draw_offscreen_init(struct Scene *scene, struct View3D *v3d);
diff --git a/source/blender/editors/include/UI_icons.h b/source/blender/editors/include/UI_icons.h
index 8579778ff79..8420591aa3e 100644
--- a/source/blender/editors/include/UI_icons.h
+++ b/source/blender/editors/include/UI_icons.h
@@ -653,9 +653,9 @@ DEF_ICON(IPO_BACK)
 DEF_ICON(IPO_EASE_IN)
 DEF_ICON(IPO_EASE_OUT)
 DEF_ICON(IPO_EASE_IN_OUT)
+DEF_ICON(NORMALIZE_FCURVES)
 #ifndef DEF_ICON_BLANK_SKIP
 	/* available */
-	DEF_ICON(BLANK203)
 	DEF_ICON(BLANK204)
 	DEF_ICON(BLANK205)
 	DEF_ICON(BLANK206)
diff --git a/source/blender/editors/include/UI_interface.h b/source/blender/editors/include/UI_interface.h
index 9fbce7dd203..2ce4f3e2790 100644
--- a/source/blender/editors/include/UI_interface.h
+++ b/source/blender/editors/include/UI_interface.h
@@ -421,7 +421,7 @@ typedef void (*uiBlockCancelFunc)(struct bContext *C, void *arg1);
 
 void UI_popup_block_invoke(struct bContext *C, uiBlockCreateFunc func, void *arg);
 void UI_popup_block_invoke_ex(struct bContext *C, uiBlockCreateFunc func, void *arg, const char *opname, int opcontext);
-void UI_popup_block_ex(struct bContext *C, uiBlockCreateFunc func, uiBlockHandleFunc popup_func, uiBlockCancelFunc cancel_func, void *arg);
+void UI_popup_block_ex(struct bContext *C, uiBlockCreateFunc func, uiBlockHandleFunc popup_func, uiBlockCancelFunc cancel_func, void *arg, struct wmOperator *op);
 /* void uiPupBlockOperator(struct bContext *C, uiBlockCreateFunc func, struct wmOperator *op, int opcontext); */ /* UNUSED */
 
 void UI_popup_block_close(struct bContext *C, struct wmWindow *win, uiBlock *block);
diff --git a/source/blender/editors/interface/interface.c b/source/blender/editors/interface/interface.c
index 682db20af55..7180e18ab92 100644
--- a/source/blender/editors/interface/interface.c
+++ b/source/blender/editors/interface/interface.c
@@ -1337,7 +1337,7 @@ static void ui_but_to_pixelrect(rcti *rect, const ARegion *ar, uiBlock *block, u
 	rctf rectf;
 
 	ui_block_to_window_rctf(ar, block, &rectf, (but) ? &but->rect : &block->rect);
-	BLI_rcti_rctf_copy_floor(rect, &rectf);
+	BLI_rcti_rctf_copy(rect, &rectf);
 	BLI_rcti_translate(rect, -ar->winrct.xmin, -ar->winrct.ymin);
 }
 
@@ -2288,7 +2288,7 @@ char *ui_but_string_get_dynamic(uiBut *but, int *r_str_size)
 
 #ifdef WITH_PYTHON
 
-static bool ui_set_but_string_eval_num_unit(bContext *C, uiBut *but, const char *str, double *value)
+static bool ui_set_but_string_eval_num_unit(bContext *C, uiBut *but, const char *str, double *r_value)
 {
 	char str_unit_convert[256];
 	const int unit_type = UI_but_unit_type_get(but);
@@ -2300,13 +2300,13 @@ static bool ui_set_but_string_eval_num_unit(bContext *C, uiBut *but, const char
 	bUnit_ReplaceString(str_unit_convert, sizeof(str_unit_convert), but->drawstr,
 	                    ui_get_but_scale_unit(but, 1.0), but->block->unit->system, RNA_SUBTYPE_UNIT_VALUE(unit_type));
 
-	return BPY_execute_string_as_number(C, str_unit_convert, value, true);
+	return BPY_execute_string_as_number(C, str_unit_convert, true, r_value);
 }
 
 #endif /* WITH_PYTHON */
 
 
-bool ui_but_string_set_eval_num(bContext *C, uiBut *but, const char *str, double *value)
+bool ui_but_string_set_eval_num(bContext *C, uiBut *but, const char *str, double *r_value)
 {
 	bool ok = false;
 
@@ -2315,13 +2315,13 @@ bool ui_but_string_set_eval_num(bContext *C, uiBut *but, const char *str, double
 	if (str[0] != '\0') {
 		bool is_unit_but = (ui_but_is_float(but) && ui_but_is_unit(but));
 		/* only enable verbose if we won't run again with units */
-		if (BPY_execute_string_as_number(C, str, value, is_unit_but == false)) {
+		if (BPY_execute_string_as_number(C, str, is_unit_but == false, r_value)) {
 			/* if the value parsed ok without unit conversion this button may still need a unit multiplier */
 			if (is_unit_but) {
 				char str_new[128];
 
-				BLI_snprintf(str_new, sizeof(str_new), "%f", *value);
-				ok = ui_set_but_string_eval_num_unit(C, but, str_new, value);
+				BLI_snprintf(str_new, sizeof(str_new), "%f", *r_value);
+				ok = ui_set_but_string_eval_num_unit(C, but, str_new, r_value);
 			}
 			else {
 				ok = true; /* parse normal string via py (no unit conversion needed) */
@@ -2329,7 +2329,7 @@ bool ui_but_string_set_eval_num(bContext *C, uiBut *but, const char *str, double
 		}
 		else if (is_unit_but) {
 			/* parse failed, this is a unit but so run replacements and parse again */
-			ok = ui_set_but_string_eval_num_unit(C, but, str, value);
+			ok = ui_set_but_string_eval_num_unit(C, but, str, r_value);
 		}
 	}
 
@@ -3913,6 +3913,8 @@ uiBut *uiDefIconTextButO_ptr(uiBlock *block, int type, wmOperatorType *ot, int o
 uiBut *uiDefIconTextButO(uiBlock *block, int type, const char *opname, int opcontext, int icon, const char *str, int x, int y, short width, short height, const char *tip)
 {
 	wmOperatorType *ot = WM_operatortype_find(opname, 0);
+	if (str && str[0] == '\0') 
+		return uiDefIconButO_ptr(block, type, ot, opcontext, icon, x, y, width, height, tip);
 	return uiDefIconTextButO_ptr(block, type, ot, opcontext, icon, str, x, y, width, height, tip);
 }
 
diff --git a/source/blender/editors/interface/interface_anim.c b/source/blender/editors/interface/interface_anim.c
index 5da294302e9..a04360b3395 100644
--- a/source/blender/editors/interface/interface_anim.c
+++ b/source/blender/editors/interface/interface_anim.c
@@ -39,6 +39,7 @@
 #include "BLI_utildefines.h"
 
 #include "BKE_context.h"
+#include "BKE_depsgraph.h"
 #include "BKE_fcurve.h"
 #include "BKE_global.h"
 #include "BKE_nla.h"
@@ -210,6 +211,7 @@ bool ui_but_anim_expression_create(uiBut *but, const char *str)
 
 			/* updates */
 			driver->flag |= DRIVER_FLAG_RECOMPILE;
+			DAG_relations_tag_update(CTX_data_main(C));
 			WM_event_add_notifier(C, NC_ANIMATION | ND_KEYFRAME, NULL);
 			ok = true;
 		}
diff --git a/source/blender/editors/interface/interface_eyedropper.c b/source/blender/editors/interface/interface_eyedropper.c
index 5154a77ad21..f3859154dfb 100644
--- a/source/blender/editors/interface/interface_eyedropper.c
+++ b/source/blender/editors/interface/interface_eyedropper.c
@@ -1082,7 +1082,7 @@ static int depthdropper_poll(bContext *C)
 			return 1;
 		}
 	}
-	else  {
+	else {
 		RegionView3D *rv3d = CTX_wm_region_view3d(C);
 		if (rv3d && rv3d->persp == RV3D_CAMOB) {
 			View3D *v3d = CTX_wm_view3d(C);
diff --git a/source/blender/editors/interface/interface_handlers.c b/source/blender/editors/interface/interface_handlers.c
index 734cd02a056..2974c2e9304 100644
--- a/source/blender/editors/interface/interface_handlers.c
+++ b/source/blender/editors/interface/interface_handlers.c
@@ -87,6 +87,7 @@
 
 #include "WM_api.h"
 #include "WM_types.h"
+#include "wm_event_system.h"
 
 #ifdef WITH_INPUT_IME
 #  include "wm_window.h"
@@ -380,6 +381,7 @@ typedef struct uiAfterFunc {
 	void *butm_func_arg;
 	int a2;
 
+	wmOperator *popup_op;
 	wmOperatorType *optype;
 	int opcontext;
 	PointerRNA *opptr;
@@ -635,13 +637,24 @@ PointerRNA *ui_handle_afterfunc_add_operator(wmOperatorType *ot, int opcontext,
 	return ptr;
 }
 
+static void popup_check(bContext *C, wmOperator *op)
+{
+	if (op && op->type->check && op->type->check(C, op)) {
+		/* check for popup and re-layout buttons */
+		ARegion *ar_menu = CTX_wm_menu(C);
+		if (ar_menu)
+			ED_region_tag_refresh_ui(ar_menu);
+	}
+}
+
 /**
  * Check if a #uiAfterFunc is needed for this button.
  */
 static bool ui_afterfunc_check(const uiBlock *block, const uiBut *but)
 {
 	return (but->func || but->funcN || but->rename_func || but->optype || but->rnaprop || block->handle_func ||
-	        (but->type == UI_BTYPE_BUT_MENU && block->butm_func));
+	        (but->type == UI_BTYPE_BUT_MENU && block->butm_func) ||
+	        (block->handle && block->handle->popup_op));
 }
 
 static void ui_apply_but_func(bContext *C, uiBut *but)
@@ -682,6 +695,9 @@ static void ui_apply_but_func(bContext *C, uiBut *but)
 			after->butm_func_arg = block->butm_func_arg;
 			after->a2 = but->a2;
 		}
+		
+		if (block->handle)
+			after->popup_op = block->handle->popup_op;
 
 		after->optype = but->optype;
 		after->opcontext = but->opcontext;
@@ -766,6 +782,9 @@ static void ui_apply_but_funcs_after(bContext *C)
 		if (after.context)
 			CTX_store_set(C, after.context);
 
+		if (after.popup_op)
+			popup_check(C, after.popup_op);
+		
 		if (after.opptr) {
 			/* free in advance to avoid leak on exit */
 			opptr = *after.opptr;
@@ -6665,7 +6684,7 @@ static void remove_shortcut_func(bContext *C, void *arg1, void *UNUSED(arg2))
 static void popup_add_shortcut_func(bContext *C, void *arg1, void *UNUSED(arg2))
 {
 	uiBut *but = (uiBut *)arg1;
-	UI_popup_block_ex(C, menu_add_shortcut, NULL, menu_add_shortcut_cancel, but);
+	UI_popup_block_ex(C, menu_add_shortcut, NULL, menu_add_shortcut_cancel, but, NULL);
 }
 
 /**
@@ -7005,20 +7024,17 @@ static bool ui_but_menu(bContext *C, uiBut *but)
 			uiItemO(layout, CTX_IFACE_(BLT_I18NCONTEXT_OPERATOR_DEFAULT, "Online Manual"),
 			        ICON_URL, "WM_OT_doc_view_manual_ui_context");
 
-			WM_operator_properties_create(&ptr_props, "WM_OT_doc_view");
+			ptr_props = uiItemFullO(layout, "WM_OT_doc_view",
+			                            CTX_IFACE_(BLT_I18NCONTEXT_OPERATOR_DEFAULT, "Online Python Reference"),
+			                            ICON_NONE, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 			RNA_string_set(&ptr_props, "doc_id", buf);
-			uiItemFullO(layout, "WM_OT_doc_view",
-			            CTX_IFACE_(BLT_I18NCONTEXT_OPERATOR_DEFAULT, "Online Python Reference"),
-			            ICON_NONE, ptr_props.data, WM_OP_EXEC_DEFAULT, 0);
 
 			/* XXX inactive option, not for public! */
 #if 0
-			WM_operator_properties_create(&ptr_props, "WM_OT_doc_edit");
+			ptr_props = uiItemFullO(layout, "WM_OT_doc_edit",
+			                            "Submit Description", ICON_NONE, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 			RNA_string_set(&ptr_props, "doc_id", buf);
 			RNA_string_set(&ptr_props, "doc_new", RNA_property_description(but->rnaprop));
-
-			uiItemFullO(layout, "WM_OT_doc_edit",
-			            "Submit Description", ICON_NONE, ptr_props.data, WM_OP_INVOKE_DEFAULT, 0);
 #endif
 		}
 	}
@@ -7741,7 +7757,8 @@ static void button_activate_state(bContext *C, uiBut *but, uiHandleButtonState s
 		if (ui_but_is_cursor_warp(but)) {
 
 #ifdef USE_CONT_MOUSE_CORRECT
-			if (data->ungrab_mval[0] != FLT_MAX) {
+			/* stereo3d has issues with changing the cursor location, so rather avoid it */
+			if (data->ungrab_mval[0] != FLT_MAX && !WM_stereo3d_enabled(data->window, false)) {
 				int mouse_ungrab_xy[2];
 				ui_block_to_window_fl(data->region, but->block, &data->ungrab_mval[0], &data->ungrab_mval[1]);
 				mouse_ungrab_xy[0] = data->ungrab_mval[0];
@@ -10181,6 +10198,25 @@ void UI_popup_handlers_add(bContext *C, ListBase *handlers, uiPopupBlockHandle *
 
 void UI_popup_handlers_remove(ListBase *handlers, uiPopupBlockHandle *popup)
 {
+	wmEventHandler *handler;
+
+	for (handler = handlers->first; handler; handler = handler->next) {
+		if (handler->ui_handle == ui_popup_handler &&
+		    handler->ui_remove == ui_popup_handler_remove &&
+		    handler->ui_userdata == popup)
+		{
+			/* tag refresh parent popup */
+			if (handler->next && 
+				handler->next->ui_handle == ui_popup_handler && 
+				handler->next->ui_remove == ui_popup_handler_remove) 
+			{
+				uiPopupBlockHandle *parent_popup = handler->next->ui_userdata;
+				ED_region_tag_refresh_ui(parent_popup->region);
+			}
+			break;
+		}
+	}
+
 	WM_event_remove_ui_handler(handlers, ui_popup_handler, ui_popup_handler_remove, popup, false);
 }
 
diff --git a/source/blender/editors/interface/interface_intern.h b/source/blender/editors/interface/interface_intern.h
index d8f9fdcbaae..067279777ba 100644
--- a/source/blender/editors/interface/interface_intern.h
+++ b/source/blender/editors/interface/interface_intern.h
@@ -556,6 +556,7 @@ struct uiPopupBlockHandle {
 	struct uiKeyNavLock keynav_state;
 
 	/* for operator popups */
+	struct wmOperator *popup_op;
 	struct wmOperatorType *optype;
 	ScrArea *ctx_area;
 	ARegion *ctx_region;
diff --git a/source/blender/editors/interface/interface_layout.c b/source/blender/editors/interface/interface_layout.c
index ca2538022b0..9b6547cf8a1 100644
--- a/source/blender/editors/interface/interface_layout.c
+++ b/source/blender/editors/interface/interface_layout.c
@@ -189,7 +189,7 @@ static const char *ui_item_name_add_colon(const char *name, char namestr[UI_MAX_
 	return name;
 }
 
-static int ui_item_fit(int item, int pos, int all, int available, bool is_last, int alignment)
+static int ui_item_fit(int item, int pos, int all, int available, bool is_last, int alignment, float *extra_pixel)
 {
 	/* available == 0 is unlimited */
 	if (available == 0)
@@ -199,16 +199,22 @@ static int ui_item_fit(int item, int pos, int all, int available, bool is_last,
 		/* contents is bigger than available space */
 		if (is_last)
 			return available - pos;
-		else
-			return (item * available) / all;
+		else {
+			float width = *extra_pixel + (item * available) / (float)all;
+			*extra_pixel = width - (int)width;
+			return (int)width;
+		}
 	}
 	else {
 		/* contents is smaller or equal to available space */
 		if (alignment == UI_LAYOUT_ALIGN_EXPAND) {
 			if (is_last)
 				return available - pos;
-			else
-				return (item * available) / all;
+			else {
+				float width = *extra_pixel + (item * available) / (float)all;
+				*extra_pixel = width - (int)width;
+				return (int)width;
+			}
 		}
 		else
 			return item;
@@ -302,6 +308,26 @@ static void ui_item_position(uiItem *item, int x, int y, int w, int h)
 	}
 }
 
+static void ui_item_move(uiItem *item, int delta_xmin, int delta_xmax)
+{
+	if (item->type == ITEM_BUTTON) {
+		uiButtonItem *bitem = (uiButtonItem *)item;
+
+		bitem->but->rect.xmin += delta_xmin;
+		bitem->but->rect.xmax += delta_xmax;
+		
+		ui_but_update(bitem->but); /* for strlen */
+	}
+	else {
+		uiLayout *litem = (uiLayout *)item;
+
+		if (delta_xmin > 0)
+			litem->x += delta_xmin;
+		else
+			litem->w += delta_xmax;
+	}
+}
+
 /******************** Special RNA Items *********************/
 
 static int ui_layout_local_dir(uiLayout *layout)
@@ -2099,9 +2125,10 @@ static int ui_litem_min_width(int itemw)
 
 static void ui_litem_layout_row(uiLayout *litem)
 {
-	uiItem *item;
+	uiItem *item, *last_free_item = NULL;
 	int x, y, w, tot, totw, neww, newtotw, itemw, minw, itemh, offset;
 	int fixedw, freew, fixedx, freex, flag = 0, lastw = 0;
+	float extra_pixel;
 
 	/* x = litem->x; */ /* UNUSED */
 	y = litem->y;
@@ -2128,6 +2155,7 @@ static void ui_litem_layout_row(uiLayout *litem)
 		x = 0;
 		flag = 0;
 		newtotw = totw;
+		extra_pixel = 0.0f;
 
 		for (item = litem->items.first; item; item = item->next) {
 			if (item->flag & UI_ITEM_FIXED)
@@ -2137,13 +2165,23 @@ static void ui_litem_layout_row(uiLayout *litem)
 			minw = ui_litem_min_width(itemw);
 
 			if (w - lastw > 0)
-				neww = ui_item_fit(itemw, x, totw, w - lastw, !item->next, litem->alignment);
+				neww = ui_item_fit(itemw, x, totw, w - lastw, !item->next, litem->alignment, &extra_pixel);
 			else
 				neww = 0;  /* no space left, all will need clamping to minimum size */
 
 			x += neww;
 
-			if ((neww < minw || itemw == minw || item->flag & UI_ITEM_MIN) && w != 0) {
+			bool min_flag = item->flag & UI_ITEM_MIN;
+			/* ignore min flag for rows with right or center alignment */
+			if (item->type != ITEM_BUTTON &&
+			    ELEM(((uiLayout *)item)->alignment, UI_LAYOUT_ALIGN_RIGHT, UI_LAYOUT_ALIGN_CENTER) &&
+			    litem->alignment == UI_LAYOUT_ALIGN_EXPAND && 
+			    ((uiItem *)litem)->flag & UI_ITEM_MIN)
+			{
+				min_flag = false;
+			}
+
+			if ((neww < minw || min_flag) && w != 0) {
 				/* fixed size */
 				item->flag |= UI_ITEM_FIXED;
 				if (item->type != ITEM_BUTTON && item->flag & UI_ITEM_MIN) {
@@ -2166,6 +2204,7 @@ static void ui_litem_layout_row(uiLayout *litem)
 
 	freex = 0;
 	fixedx = 0;
+	extra_pixel = 0.0f;
 	x = litem->x;
 
 	for (item = litem->items.first; item; item = item->next) {
@@ -2177,13 +2216,14 @@ static void ui_litem_layout_row(uiLayout *litem)
 			if (item->type != ITEM_BUTTON && item->flag & UI_ITEM_MIN) {
 				minw = itemw;
 			}
-			itemw = ui_item_fit(minw, fixedx, fixedw, min_ii(w, fixedw), !item->next, litem->alignment);
+			itemw = ui_item_fit(minw, fixedx, fixedw, min_ii(w, fixedw), !item->next, litem->alignment, &extra_pixel);
 			fixedx += itemw;
 		}
 		else {
 			/* free size item */
-			itemw = ui_item_fit(itemw, freex, freew, w - fixedw, !item->next, litem->alignment);
+			itemw = ui_item_fit(itemw, freex, freew, w - fixedw, !item->next, litem->alignment, &extra_pixel);
 			freex += itemw;
+			last_free_item = item;
 		}
 
 		/* align right/center */
@@ -2205,6 +2245,17 @@ static void ui_litem_layout_row(uiLayout *litem)
 			x += litem->space;
 	}
 
+	/* add extra pixel */
+	uiItem *last_item = litem->items.last;
+	extra_pixel = litem->w - (x - litem->x);
+	if (extra_pixel > 0 && litem->alignment == UI_LAYOUT_ALIGN_EXPAND &&
+	    last_free_item && last_item && last_item->flag & UI_ITEM_FIXED)
+	{
+		ui_item_move(last_free_item, 0, extra_pixel);
+		for (item = last_free_item->next; item; item = item->next)
+			ui_item_move(item, extra_pixel, extra_pixel);
+	}
+
 	litem->w = x - litem->x;
 	litem->h = litem->y - y;
 	litem->x = x;
@@ -2212,7 +2263,7 @@ static void ui_litem_layout_row(uiLayout *litem)
 }
 
 /* single-column layout */
-static void ui_litem_estimate_column(uiLayout *litem)
+static void ui_litem_estimate_column(uiLayout *litem, bool is_box)
 {
 	uiItem *item;
 	int itemw, itemh;
@@ -2229,16 +2280,16 @@ static void ui_litem_estimate_column(uiLayout *litem)
 		litem->w = MAX2(litem->w, itemw);
 		litem->h += itemh;
 
-		if (item->next)
+		if (item->next && (!is_box || item != litem->items.first))
 			litem->h += litem->space;
 	}
 
 	if (min_size_flag) {
 		litem->item.flag |= UI_ITEM_MIN;
 	}
 }
 
-static void ui_litem_layout_column(uiLayout *litem)
+static void ui_litem_layout_column(uiLayout *litem, bool is_box)
 {
 	uiItem *item;
 	int itemh, x, y;
@@ -2252,7 +2303,7 @@ static void ui_litem_layout_column(uiLayout *litem)
 		y -= itemh;
 		ui_item_position(item, x, y, litem->w, itemh);
 
-		if (item->next)
+		if (item->next && (!is_box || item != litem->items.first))
 			y -= litem->space;
 	}
 
@@ -2394,7 +2445,7 @@ static void ui_litem_layout_root(uiLayout *litem)
 	else if (litem->root->type == UI_LAYOUT_PIEMENU)
 		ui_litem_layout_root_radial(litem);
 	else
-		ui_litem_layout_column(litem);
+		ui_litem_layout_column(litem, false);
 }
 
 /* box layout */
@@ -2402,9 +2453,10 @@ static void ui_litem_estimate_box(uiLayout *litem)
 {
 	uiStyle *style = litem->root->style;
 
-	ui_litem_estimate_column(litem);
+	ui_litem_estimate_column(litem, true);
+	litem->item.flag &= ~UI_ITEM_MIN;
 	litem->w += 2 * style->boxspace;
-	litem->h += style->boxspace;
+	litem->h += 2 * style->boxspace;
 }
 
 static void ui_litem_layout_box(uiLayout *litem)
@@ -2418,17 +2470,18 @@ static void ui_litem_layout_box(uiLayout *litem)
 	h = litem->h;
 
 	litem->x += style->boxspace;
+	litem->y -= style->boxspace;
 
 	if (w != 0) litem->w -= 2 * style->boxspace;
 	if (h != 0) litem->h -= 2 * style->boxspace;
 
-	ui_litem_layout_column(litem);
+	ui_litem_layout_column(litem, true);
 
 	litem->x -= style->boxspace;
 	litem->y -= style->boxspace;
 
 	if (w != 0) litem->w += 2 * style->boxspace;
-	if (h != 0) litem->h += style->boxspace;
+	if (h != 0) litem->h += 2 * style->boxspace;
 
 	/* roundbox around the sublayout */
 	but = box->roundbox;
@@ -2648,13 +2701,14 @@ static void ui_litem_layout_absolute(uiLayout *litem)
 static void ui_litem_estimate_split(uiLayout *litem)
 {
 	ui_litem_estimate_row(litem);
+	litem->item.flag &= ~UI_ITEM_MIN;
 }
 
 static void ui_litem_layout_split(uiLayout *litem)
 {
 	uiLayoutItemSplit *split = (uiLayoutItemSplit *)litem;
 	uiItem *item;
-	float percentage;
+	float percentage, extra_pixel = 0.0f;
 	const int tot = BLI_listbase_count(&litem->items);
 	int itemh, x, y, w, colw = 0;
 
@@ -2677,7 +2731,9 @@ static void ui_litem_layout_split(uiLayout *litem)
 		x += colw;
 
 		if (item->next) {
-			colw = (w - (int)(w * percentage)) / (tot - 1);
+			const float width = extra_pixel + (w - (int)(w * percentage)) / ((float)tot - 1);
+			extra_pixel = width - (int)width;
+			colw = (int)width;
 			colw = MAX2(colw, 0);
 
 			x += litem->space;
@@ -3078,7 +3134,7 @@ static void ui_item_estimate(uiItem *item)
 
 		switch (litem->item.type) {
 			case ITEM_LAYOUT_COLUMN:
-				ui_litem_estimate_column(litem);
+				ui_litem_estimate_column(litem, false);
 				break;
 			case ITEM_LAYOUT_COLUMN_FLOW:
 				ui_litem_estimate_column_flow(litem);
@@ -3133,9 +3189,9 @@ static void ui_item_align(uiLayout *litem, short nr)
 		}
 		else if (item->type == ITEM_LAYOUT_BOX) {
 			box = (uiLayoutItemBx *)item;
-			box->roundbox->alignnr = nr;
-			BLI_remlink(&litem->root->block->buttons, box->roundbox);
-			BLI_addhead(&litem->root->block->buttons, box->roundbox);
+			if (!box->roundbox->alignnr) {
+				box->roundbox->alignnr = nr;
+			}
 		}
 		else if (((uiLayout *)item)->align) {
 			ui_item_align((uiLayout *)item, nr);
@@ -3177,7 +3233,7 @@ static void ui_item_layout(uiItem *item)
 
 		switch (litem->item.type) {
 			case ITEM_LAYOUT_COLUMN:
-				ui_litem_layout_column(litem);
+				ui_litem_layout_column(litem, false);
 				break;
 			case ITEM_LAYOUT_COLUMN_FLOW:
 				ui_litem_layout_column_flow(litem);
@@ -3521,14 +3577,13 @@ void uiLayoutOperatorButs(
 		row = uiLayoutRow(layout, true);
 		uiItemM(row, (bContext *)C, "WM_MT_operator_presets", NULL, ICON_NONE);
 
-		WM_operator_properties_create(&op_ptr, "WM_OT_operator_preset_add");
+		wmOperatorType *ot = WM_operatortype_find("WM_OT_operator_preset_add", false);
+		op_ptr = uiItemFullO_ptr(row, ot, "", ICON_ZOOMIN, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 		RNA_string_set(&op_ptr, "operator", op->type->idname);
-		uiItemFullO(row, "WM_OT_operator_preset_add", "", ICON_ZOOMIN, op_ptr.data, WM_OP_INVOKE_DEFAULT, 0);
 
-		WM_operator_properties_create(&op_ptr, "WM_OT_operator_preset_add");
+		op_ptr = uiItemFullO_ptr(row, ot, "", ICON_ZOOMOUT, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 		RNA_string_set(&op_ptr, "operator", op->type->idname);
 		RNA_boolean_set(&op_ptr, "remove_active", true);
-		uiItemFullO(row, "WM_OT_operator_preset_add", "", ICON_ZOOMOUT, op_ptr.data, WM_OP_INVOKE_DEFAULT, 0);
 	}
 
 	if (op->type->ui) {
diff --git a/source/blender/editors/interface/interface_ops.c b/source/blender/editors/interface/interface_ops.c
index 40ebc946e79..fb95cdf389b 100644
--- a/source/blender/editors/interface/interface_ops.c
+++ b/source/blender/editors/interface/interface_ops.c
@@ -739,6 +739,11 @@ static int editsource_text_edit(
 
 	if (text == NULL) {
 		text = BKE_text_load(bmain, filepath, bmain->name);
+		/* Loading can fail: only touch the ID when a text was actually returned,
+		 * the NULL case is handled by the check right below. */
+		if (text != NULL) {
+			id_us_ensure_real(&text->id);
+		}
 	}
 
 	if (text == NULL) {
diff --git a/source/blender/editors/interface/interface_regions.c b/source/blender/editors/interface/interface_regions.c
index 466978272bc..534bd4278ca 100644
--- a/source/blender/editors/interface/interface_regions.c
+++ b/source/blender/editors/interface/interface_regions.c
@@ -1692,6 +1692,28 @@ static void ui_block_region_draw(const bContext *C, ARegion *ar)
 		UI_block_draw(C, block);
 }
 
+/**
+ * Used to refresh centered popups on screen resizing (for splash).
+ */
+static void ui_block_region_popup_window_listener(
+        bScreen *UNUSED(sc), ScrArea *UNUSED(sa), ARegion *ar, wmNotifier *wmn)
+{
+	switch (wmn->category) {
+		case NC_WINDOW:
+		{
+			switch (wmn->action) {
+				case NA_EDITED:
+				{
+					/* window resize */
+					ED_region_tag_refresh_ui(ar);
+					break;
+				}
+			}
+			break;
+		}
+	}
+}
+
 static void ui_popup_block_clip(wmWindow *window, uiBlock *block)
 {
 	uiBut *bt;
@@ -2003,6 +2025,11 @@ uiPopupBlockHandle *ui_popup_block_create(
 	block = ui_popup_block_refresh(C, handle, butregion, but);
 	handle = block->handle;
 
+	/* keep centered on window resizing */
+	if ((block->bounds_type == UI_BLOCK_BOUNDS_POPUP_CENTER) && handle->can_refresh) {
+		type.listener = ui_block_region_popup_window_listener;
+	}
+
 	return handle;
 }
 
@@ -3285,7 +3312,7 @@ void UI_popup_block_invoke(bContext *C, uiBlockCreateFunc func, void *arg)
 	UI_popup_block_invoke_ex(C, func, arg, NULL, WM_OP_INVOKE_DEFAULT);
 }
 
-void UI_popup_block_ex(bContext *C, uiBlockCreateFunc func, uiBlockHandleFunc popup_func, uiBlockCancelFunc cancel_func, void *arg)
+void UI_popup_block_ex(bContext *C, uiBlockCreateFunc func, uiBlockHandleFunc popup_func, uiBlockCancelFunc cancel_func, void *arg, wmOperator *op)
 {
 	wmWindow *window = CTX_wm_window(C);
 	uiPopupBlockHandle *handle;
@@ -3294,6 +3321,7 @@ void UI_popup_block_ex(bContext *C, uiBlockCreateFunc func, uiBlockHandleFunc po
 	handle->popup = true;
 	handle->retvalue = 1;
 
+	handle->popup_op = op;
 	handle->popup_arg = arg;
 	handle->popup_func = popup_func;
 	handle->cancel_func = cancel_func;
diff --git a/source/blender/editors/interface/interface_templates.c b/source/blender/editors/interface/interface_templates.c
index 4db1c845c23..131584dd405 100644
--- a/source/blender/editors/interface/interface_templates.c
+++ b/source/blender/editors/interface/interface_templates.c
@@ -1978,6 +1978,7 @@ static void curvemap_tools_dofunc(bContext *C, void *cumap_v, int event)
 		case UICURVE_FUNC_HANDLE_AUTO_ANIM: /* set auto-clamped */
 			curvemap_handle_set(cuma, HD_AUTO_ANIM);
 			curvemapping_changed(cumap, false);
+			break;
 		case UICURVE_FUNC_EXTEND_HOZ: /* extend horiz */
 			cuma->flag &= ~CUMA_EXTEND_EXTRAPOLATE;
 			curvemapping_changed(cumap, false);
diff --git a/source/blender/editors/interface/interface_utils.c b/source/blender/editors/interface/interface_utils.c
index df6f098ee81..636b7e4e9ce 100644
--- a/source/blender/editors/interface/interface_utils.c
+++ b/source/blender/editors/interface/interface_utils.c
@@ -127,12 +127,10 @@ uiBut *uiDefAutoButR(uiBlock *block, PointerRNA *ptr, PropertyRNA *prop, int ind
 			break;
 		case PROP_POINTER:
 		{
-			PointerRNA pptr;
-
-			pptr = RNA_property_pointer_get(ptr, prop);
-			if (!pptr.type)
-				pptr.type = RNA_property_pointer_type(ptr, prop);
-			icon = RNA_struct_ui_icon(pptr.type);
+			if (icon == 0) {
+				PointerRNA pptr = RNA_property_pointer_get(ptr, prop);
+				icon = RNA_struct_ui_icon(pptr.type ? pptr.type : RNA_property_pointer_type(ptr, prop));
+			}
 			if (icon == ICON_DOT)
 				icon = 0;
 
diff --git a/source/blender/editors/interface/interface_widgets.c b/source/blender/editors/interface/interface_widgets.c
index d43a94c5514..f7f2b422724 100644
--- a/source/blender/editors/interface/interface_widgets.c
+++ b/source/blender/editors/interface/interface_widgets.c
@@ -874,21 +874,18 @@ static void widget_draw_icon(
 		float ofs = 1.0f / aspect;
 		
 		if (but->drawflag & UI_BUT_ICON_LEFT) {
-			if (but->block->flag & UI_BLOCK_LOOP) {
-				if (but->type == UI_BTYPE_SEARCH_MENU)
-					xs = rect->xmin + 4.0f * ofs;
-				else
-					xs = rect->xmin + ofs;
-			}
-			else {
+			/* special case - icon_only pie buttons */
+			if (ui_block_is_pie_menu(but->block) && but->type != UI_BTYPE_MENU && but->str && but->str[0] == '\0')
+				xs = rect->xmin + 2.0f * ofs;
+			else if (but->dt == UI_EMBOSS_NONE || but->type == UI_BTYPE_LABEL)
+				xs = rect->xmin + 2.0f * ofs;
+			else
 				xs = rect->xmin + 4.0f * ofs;
-			}
-			ys = (rect->ymin + rect->ymax - height) / 2.0f;
 		}
 		else {
 			xs = (rect->xmin + rect->xmax - height) / 2.0f;
-			ys = (rect->ymin + rect->ymax - height) / 2.0f;
 		}
+		ys = (rect->ymin + rect->ymax - height) / 2.0f;
 
 		/* force positions to integers, for zoom levels near 1. draws icons crisp. */
 		if (aspect > 0.95f && aspect < 1.05f) {
@@ -1554,11 +1551,15 @@ static void widget_draw_text_icon(uiFontStyle *fstyle, uiWidgetColors *wcol, uiB
 	/* Icons on the left with optional text label on the right */
 	else if (but->flag & UI_HAS_ICON || show_menu_icon) {
 		const BIFIconID icon = (but->flag & UI_HAS_ICON) ? but->icon + but->iconadd : ICON_NONE;
-		const float icon_size = ICON_SIZE_FROM_BUTRECT(rect);
+		const float icon_size = ICON_DEFAULT_WIDTH_SCALE;
 
 		/* menu item - add some more padding so menus don't feel cramped. it must
 		 * be part of the button so that this area is still clickable */
-		if (ui_block_is_menu(but->block))
+		if (ui_block_is_pie_menu(but->block)) {
+			if (but->dt == UI_EMBOSS_RADIAL)
+				rect->xmin += 0.3f * U.widget_unit;
+		}
+		else if (ui_block_is_menu(but->block))
 			rect->xmin += 0.3f * U.widget_unit;
 
 		widget_draw_icon(but, icon, alpha, rect, show_menu_icon);
@@ -1584,7 +1585,7 @@ static void widget_draw_text_icon(uiFontStyle *fstyle, uiWidgetColors *wcol, uiB
 		temp.xmin = temp.xmax - (BLI_rcti_size_y(rect) * 1.08f);
 
 		if (extra_icon_type == UI_BUT_ICONEXTRA_CLEAR) {
-			widget_draw_icon(but, ICON_X, alpha, &temp, false);
+			widget_draw_icon(but, ICON_PANEL_CLOSE, alpha, &temp, false);
 		}
 		else if (extra_icon_type == UI_BUT_ICONEXTRA_EYEDROPPER) {
 			widget_draw_icon(but, ICON_EYEDROPPER, alpha, &temp, false);
diff --git a/source/blender/editors/io/io_alembic.c b/source/blender/editors/io/io_alembic.c
index a991f59e8e2..3f9eb33e239 100644
--- a/source/blender/editors/io/io_alembic.c
+++ b/source/blender/editors/io/io_alembic.c
@@ -124,9 +124,9 @@ static int wm_alembic_export_exec(bContext *C, wmOperator *op)
 	    .use_subdiv_schema = RNA_boolean_get(op->ptr, "subdiv_schema"),
 	    .compression_type = RNA_enum_get(op->ptr, "compression_type"),
 	    .packuv = RNA_boolean_get(op->ptr, "packuv"),
-		.triangulate = RNA_boolean_get(op->ptr, "triangulate"),
-		.quad_method = RNA_enum_get(op->ptr, "quad_method"),
-		.ngon_method = RNA_enum_get(op->ptr, "ngon_method"),
+	    .triangulate = RNA_boolean_get(op->ptr, "triangulate"),
+	    .quad_method = RNA_enum_get(op->ptr, "quad_method"),
+	    .ngon_method = RNA_enum_get(op->ptr, "ngon_method"),
 
 	    .global_scale = RNA_float_get(op->ptr, "global_scale"),
 	};
diff --git a/source/blender/editors/io/io_collada.c b/source/blender/editors/io/io_collada.c
index baae92f962e..b8794afbf85 100644
--- a/source/blender/editors/io/io_collada.c
+++ b/source/blender/editors/io/io_collada.c
@@ -97,7 +97,9 @@ static int wm_collada_export_exec(bContext *C, wmOperator *op)
 	int use_blender_profile;
 	int sort_by_name;
 	int export_transformation_type;
+
 	int open_sim;
+	int keep_bind_info;
 
 	int export_count;
 
@@ -148,6 +150,8 @@ static int wm_collada_export_exec(bContext *C, wmOperator *op)
 	export_transformation_type = RNA_enum_get(op->ptr,    "export_transformation_type_selection");
 	open_sim                   = RNA_boolean_get(op->ptr, "open_sim");
 
+	keep_bind_info = RNA_boolean_get(op->ptr, "keep_bind_info");
+
 	/* get editmode results */
 	ED_object_editmode_load(CTX_data_edit_object(C));
 
@@ -172,7 +176,10 @@ static int wm_collada_export_exec(bContext *C, wmOperator *op)
 		use_blender_profile,
 		sort_by_name,
 		export_transformation_type,
-		open_sim);
+
+		open_sim,
+		keep_bind_info
+	);
 
 	if (export_count == 0) {
 		BKE_report(op->reports, RPT_WARNING, "No objects selected -- Created empty export file");
@@ -256,11 +263,11 @@ static void uiCollada_exportSettings(uiLayout *layout, PointerRNA *imfptr)
 	uiItemL(row, IFACE_("Collada Options:"), ICON_MODIFIER);
 
 	row = uiLayoutRow(box, false);
-	uiItemR(row, imfptr, "triangulate", 0, NULL, ICON_NONE);
+	uiItemR(row, imfptr, "triangulate", 1, NULL, ICON_NONE);
 	row = uiLayoutRow(box, false);
-	uiItemR(row, imfptr, "use_object_instantiation", 0, NULL, ICON_NONE);
+	uiItemR(row, imfptr, "use_object_instantiation", 1, NULL, ICON_NONE);
 	row = uiLayoutRow(box, false);
-	uiItemR(row, imfptr, "use_blender_profile", 0, NULL, ICON_NONE);
+	uiItemR(row, imfptr, "use_blender_profile", 1, NULL, ICON_NONE);
 
 	row = uiLayoutRow(box, false);
 	split = uiLayoutSplit(row, 0.6f, UI_LAYOUT_ALIGN_RIGHT);
@@ -269,6 +276,8 @@ static void uiCollada_exportSettings(uiLayout *layout, PointerRNA *imfptr)
 
 	row = uiLayoutRow(box, false);
 	uiItemR(row, imfptr, "sort_by_name", 0, NULL, ICON_NONE);
+	row = uiLayoutRow(box, false);
+	uiItemR(row, imfptr, "keep_bind_info", 0, NULL, ICON_NONE);
 
 }
 
@@ -296,6 +305,8 @@ static bool wm_collada_export_check(bContext *UNUSED(C), wmOperator *op)
 
 void WM_OT_collada_export(wmOperatorType *ot)
 {
+	struct StructRNA *func = ot->srna;
+
 	static EnumPropertyItem prop_bc_export_mesh_type[] = {
 		{BC_MESH_TYPE_VIEW, "view", 0, "View", "Apply modifier's view settings"},
 		{BC_MESH_TYPE_RENDER, "render", 0, "Render", "Apply modifier's render settings"},
@@ -325,65 +336,69 @@ void WM_OT_collada_export(wmOperatorType *ot)
 	        ot, FILE_TYPE_FOLDER | FILE_TYPE_COLLADA, FILE_BLENDER, FILE_SAVE,
 	        WM_FILESEL_FILEPATH, FILE_DEFAULTDISPLAY, FILE_SORT_ALPHA);
 
-	RNA_def_boolean(ot->srna,
+	RNA_def_boolean(func,
 	                "apply_modifiers", 0, "Apply Modifiers",
 	                "Apply modifiers to exported mesh (non destructive))");
 
-	RNA_def_int(ot->srna, "export_mesh_type", 0, INT_MIN, INT_MAX,
+	RNA_def_int(func, "export_mesh_type", 0, INT_MIN, INT_MAX,
 	            "Resolution", "Modifier resolution for export", INT_MIN, INT_MAX);
 
-	RNA_def_enum(ot->srna, "export_mesh_type_selection", prop_bc_export_mesh_type, 0,
+	RNA_def_enum(func, "export_mesh_type_selection", prop_bc_export_mesh_type, 0,
 	             "Resolution", "Modifier resolution for export");
 
-	RNA_def_boolean(ot->srna, "selected", 0, "Selection Only",
+	RNA_def_boolean(func, "selected", 0, "Selection Only",
 	                "Export only selected elements");
 
-	RNA_def_boolean(ot->srna, "include_children", 0, "Include Children",
+	RNA_def_boolean(func, "include_children", 0, "Include Children",
 	                "Export all children of selected objects (even if not selected)");
 
-	RNA_def_boolean(ot->srna, "include_armatures", 0, "Include Armatures",
+	RNA_def_boolean(func, "include_armatures", 0, "Include Armatures",
 	                "Export related armatures (even if not selected)");
 
-	RNA_def_boolean(ot->srna, "include_shapekeys", 1, "Include Shape Keys",
+	RNA_def_boolean(func, "include_shapekeys", 1, "Include Shape Keys",
 	                "Export all Shape Keys from Mesh Objects");
 
-	RNA_def_boolean(ot->srna, "deform_bones_only", 0, "Deform Bones only",
+	RNA_def_boolean(func, "deform_bones_only", 0, "Deform Bones only",
 	                "Only export deforming bones with armatures");
 
 
-	RNA_def_boolean(ot->srna, "active_uv_only", 0, "Only Selected UV Map",
+	RNA_def_boolean(func, "active_uv_only", 0, "Only Selected UV Map",
 	                "Export only the selected UV Map");
 
-	RNA_def_boolean(ot->srna, "include_uv_textures", 0, "Include UV Textures",
+	RNA_def_boolean(func, "include_uv_textures", 0, "Include UV Textures",
 	                "Export textures assigned to the object UV Maps");
 
-	RNA_def_boolean(ot->srna, "include_material_textures", 0, "Include Material Textures",
+	RNA_def_boolean(func, "include_material_textures", 0, "Include Material Textures",
 	                "Export textures assigned to the object Materials");
 
-	RNA_def_boolean(ot->srna, "use_texture_copies", 1, "Copy",
+	RNA_def_boolean(func, "use_texture_copies", 1, "Copy",
 	                "Copy textures to same folder where the .dae file is exported");
 
 
-	RNA_def_boolean(ot->srna, "triangulate", 1, "Triangulate",
+	RNA_def_boolean(func, "triangulate", 1, "Triangulate",
 	                "Export Polygons (Quads & NGons) as Triangles");
 
-	RNA_def_boolean(ot->srna, "use_object_instantiation", 1, "Use Object Instances",
+	RNA_def_boolean(func, "use_object_instantiation", 1, "Use Object Instances",
 		"Instantiate multiple Objects from same Data");
 
-	RNA_def_boolean(ot->srna, "use_blender_profile", 1, "Use Blender Profile",
+	RNA_def_boolean(func, "use_blender_profile", 1, "Use Blender Profile",
 		"Export additional Blender specific information (for material, shaders, bones, etc.)");
 
-	RNA_def_boolean(ot->srna, "sort_by_name", 0, "Sort by Object name",
+	RNA_def_boolean(func, "sort_by_name", 0, "Sort by Object name",
 	                "Sort exported data by Object name");
 
-	RNA_def_int(ot->srna, "export_transformation_type", 0, INT_MIN, INT_MAX,
+	RNA_def_int(func, "export_transformation_type", 0, INT_MIN, INT_MAX,
 	            "Transform", "Transformation type for translation, scale and rotation", INT_MIN, INT_MAX);
 
-	RNA_def_enum(ot->srna, "export_transformation_type_selection", prop_bc_export_transformation_type, 0,
+	RNA_def_enum(func, "export_transformation_type_selection", prop_bc_export_transformation_type, 0,
 	             "Transform", "Transformation type for translation, scale and rotation");
 
-	RNA_def_boolean(ot->srna, "open_sim", 0, "Export to SL/OpenSim",
+	RNA_def_boolean(func, "open_sim", 0, "Export to SL/OpenSim",
 	                "Compatibility mode for SL, OpenSim and other compatible online worlds");
+
+	RNA_def_boolean(func, "keep_bind_info", 0,
+		"Keep Bind Info", "Store Bindpose information in custom bone properties for later use during Collada export");
+
 }
 
 
@@ -395,7 +410,9 @@ static int wm_collada_import_exec(bContext *C, wmOperator *op)
 	int find_chains;
 	int auto_connect;
 	int fix_orientation;
-	int  min_chain_length;
+	int min_chain_length;
+
+	int keep_bind_info;
 
 	if (!RNA_struct_property_is_set(op->ptr, "filepath")) {
 		BKE_report(op->reports, RPT_ERROR, "No filename given");
@@ -407,6 +424,9 @@ static int wm_collada_import_exec(bContext *C, wmOperator *op)
 	find_chains      = RNA_boolean_get(op->ptr, "find_chains");
 	auto_connect     = RNA_boolean_get(op->ptr, "auto_connect");
 	fix_orientation  = RNA_boolean_get(op->ptr, "fix_orientation");
+
+	keep_bind_info = RNA_boolean_get(op->ptr, "keep_bind_info");
+
 	min_chain_length = RNA_int_get(op->ptr, "min_chain_length");
 
 	RNA_string_get(op->ptr, "filepath", filename);
@@ -416,7 +436,8 @@ static int wm_collada_import_exec(bContext *C, wmOperator *op)
 	        find_chains,
 	        auto_connect,
 	        fix_orientation,
-	        min_chain_length))
+	        min_chain_length,
+	        keep_bind_info) )
 	{
 		return OPERATOR_FINISHED;
 	}
@@ -453,6 +474,13 @@ static void uiCollada_importSettings(uiLayout *layout, PointerRNA *imfptr)
 
 	row = uiLayoutRow(box, false);
 	uiItemR(row, imfptr, "min_chain_length", 0, NULL, ICON_NONE);
+
+	box = uiLayoutBox(layout);
+	row = uiLayoutRow(box, false);
+
+	row = uiLayoutRow(box, false);
+	uiItemR(row, imfptr, "keep_bind_info", 0, NULL, ICON_NONE);
+
 }
 
 static void wm_collada_import_draw(bContext *UNUSED(C), wmOperator *op)
@@ -508,5 +536,9 @@ void WM_OT_collada_import(wmOperatorType *ot)
 		0,
 		INT_MAX);
 
+	RNA_def_boolean(ot->srna, 
+		"keep_bind_info", 0, "Keep Bind Info", 
+		"Store Bindpose information in custom bone properties for later use during Collada export");
+
 }
 #endif
diff --git a/source/blender/editors/mesh/editmesh_intersect.c b/source/blender/editors/mesh/editmesh_intersect.c
index de93211bec4..49bfde77032 100644
--- a/source/blender/editors/mesh/editmesh_intersect.c
+++ b/source/blender/editors/mesh/editmesh_intersect.c
@@ -51,6 +51,7 @@
 #include "mesh_intern.h"  /* own include */
 
 #include "tools/bmesh_intersect.h"
+#include "tools/bmesh_separate.h"
 
 
 /* detect isolated holes and fill them */
@@ -137,6 +138,12 @@ enum {
 	ISECT_SEL_UNSEL     = 1,
 };
 
+enum {
+	ISECT_SEPARATE_ALL           = 0,
+	ISECT_SEPARATE_CUT           = 1,
+	ISECT_SEPARATE_NONE          = 2,
+};
+
 static int edbm_intersect_exec(bContext *C, wmOperator *op)
 {
 	Object *obedit = CTX_data_edit_object(C);
@@ -144,7 +151,9 @@ static int edbm_intersect_exec(bContext *C, wmOperator *op)
 	BMesh *bm = em->bm;
 	const int mode = RNA_enum_get(op->ptr, "mode");
 	int (*test_fn)(BMFace *, void *);
-	bool use_separate = RNA_boolean_get(op->ptr, "use_separate");
+	bool use_separate_all = false;
+	bool use_separate_cut = false;
+	const int separate_mode = RNA_enum_get(op->ptr, "separate_mode");
 	const float eps = RNA_float_get(op->ptr, "threshold");
 	bool use_self;
 	bool has_isect;
@@ -160,15 +169,38 @@ static int edbm_intersect_exec(bContext *C, wmOperator *op)
 			break;
 	}
 
+	switch (separate_mode) {
+		case ISECT_SEPARATE_ALL:
+			use_separate_all = true;
+			break;
+		case ISECT_SEPARATE_CUT:
+			if (use_self == false) {
+				use_separate_cut = true;
+			}
+			else {
+				/* we could support this, but it would require more advanced logic inside 'BM_mesh_intersect',
+				 * so for now just separate all */
+				use_separate_all = true;
+			}
+			break;
+		default:  /* ISECT_SEPARATE_NONE */
+			break;
+	}
 
 	has_isect = BM_mesh_intersect(
 	        bm,
 	        em->looptris, em->tottri,
 	        test_fn, NULL,
-	        use_self, use_separate, true, true,
+	        use_self, use_separate_all, true, true, true,
 	        -1,
 	        eps);
 
+	if (use_separate_cut) {
+		/* detach selected/un-selected faces */
+		BM_mesh_separate_faces(
+		        bm,
+		        BM_elem_cb_check_hflag_enabled_simple(const BMFace *, BM_ELEM_SELECT));
+	}
 
 	if (has_isect) {
 		edbm_intersect_select(em);
@@ -190,6 +222,16 @@ void MESH_OT_intersect(struct wmOperatorType *ot)
 		{0, NULL, 0, NULL, NULL}
 	};
 
+	static EnumPropertyItem isect_separate_items[] = {
+		{ISECT_SEPARATE_ALL, "ALL", 0, "All",
+		 "Separate all geometry from intersections"},
+		{ISECT_SEPARATE_CUT, "CUT", 0, "Cut",
+		 "Cut into geometry keeping each side separate (Selected/Unselected only)"},
+		{ISECT_SEPARATE_NONE, "NONE", 0, "Merge",
+		 "Merge all geometry from the intersection"},
+		{0, NULL, 0, NULL, NULL}
+	};
+
 	/* identifiers */
 	ot->name = "Intersect (Knife)";
 	ot->description = "Cut an intersection into faces";
@@ -201,7 +243,7 @@ void MESH_OT_intersect(struct wmOperatorType *ot)
 
 	/* props */
 	RNA_def_enum(ot->srna, "mode", isect_mode_items, ISECT_SEL_UNSEL, "Source", "");
-	RNA_def_boolean(ot->srna, "use_separate", true, "Separate", "");
+	RNA_def_enum(ot->srna, "separate_mode", isect_separate_items, ISECT_SEPARATE_CUT, "Separate Mode", "");
 	RNA_def_float_distance(ot->srna, "threshold", 0.000001f, 0.0, 0.01, "Merge threshold", "", 0.0, 0.001);
 
 	/* flags */
@@ -239,7 +281,7 @@ static int edbm_intersect_boolean_exec(bContext *C, wmOperator *op)
 	        bm,
 	        em->looptris, em->tottri,
 	        test_fn, NULL,
-	        false, false, true, true,
+	        false, false, true, true, true,
 	        boolean_operation,
 	        eps);
 
diff --git a/source/blender/editors/mesh/editmesh_rip.c b/source/blender/editors/mesh/editmesh_rip.c
index e05ce727e22..1a2f9fdb62b 100644
--- a/source/blender/editors/mesh/editmesh_rip.c
+++ b/source/blender/editors/mesh/editmesh_rip.c
@@ -596,7 +596,7 @@ static int edbm_rip_invoke__vert(bContext *C, wmOperator *op, const wmEvent *eve
 		BMVert *v_new;
 
 		BLI_assert(l_sep->v == v);
-		v_new = bmesh_urmv_loop_region(bm, l_sep);
+		v_new = BM_face_loop_separate_multi_isolated(bm, l_sep);
 		BLI_assert(BM_vert_find_first_loop(v));
 
 		BM_vert_select_set(bm, v, false);
@@ -665,7 +665,7 @@ static int edbm_rip_invoke__vert(bContext *C, wmOperator *op, const wmEvent *eve
 
 		BM_vert_select_set(bm, v, false);
 
-		bmesh_vert_separate(bm, v, &vout, &vout_len, true);
+		bmesh_kernel_vert_separate(bm, v, &vout, &vout_len, true);
 
 		if (vout_len < 2) {
 			MEM_freeN(vout);
diff --git a/source/blender/editors/metaball/mball_edit.c b/source/blender/editors/metaball/mball_edit.c
index ed5bf4a92b4..bc42717b69f 100644
--- a/source/blender/editors/metaball/mball_edit.c
+++ b/source/blender/editors/metaball/mball_edit.c
@@ -592,12 +592,9 @@ bool ED_mball_select_pick(bContext *C, const int mval[2], bool extend, bool dese
 
 	view3d_set_viewcontext(C, &vc);
 
-	rect.xmin = mval[0] - 12;
-	rect.xmax = mval[0] + 12;
-	rect.ymin = mval[1] - 12;
-	rect.ymax = mval[1] + 12;
+	BLI_rcti_init_pt_radius(&rect, mval, 12);
 
-	hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, true);
+	hits = view3d_opengl_select(&vc, buffer, MAXPICKBUF, &rect, VIEW3D_SELECT_PICK_NEAREST);
 
 	/* does startelem exist? */
 	ml = mb->editelems->first;
diff --git a/source/blender/editors/object/object_add.c b/source/blender/editors/object/object_add.c
index 02b2d8492b4..20523264cd9 100644
--- a/source/blender/editors/object/object_add.c
+++ b/source/blender/editors/object/object_add.c
@@ -64,6 +64,7 @@
 #include "BKE_armature.h"
 #include "BKE_camera.h"
 #include "BKE_context.h"
+#include "BKE_constraint.h"
 #include "BKE_curve.h"
 #include "BKE_depsgraph.h"
 #include "BKE_DerivedMesh.h"
@@ -1377,7 +1378,7 @@ static void make_object_duplilist_real(bContext *C, Scene *scene, Base *base,
 		ob->proxy = NULL;
 
 		ob->parent = NULL;
-		BLI_listbase_clear(&ob->constraints);
+		BKE_constraints_free(&ob->constraints);
 		ob->curve_cache = NULL;
 		ob->transflag &= ~OB_DUPLI;
 		ob->lay = base->lay;
@@ -1655,8 +1656,25 @@ static int convert_exec(bContext *C, wmOperator *op)
 		}
 	}
 
-	CTX_DATA_BEGIN (C, Base *, base, selected_editable_bases)
+	ListBase selected_editable_bases = CTX_data_collection_get(C, "selected_editable_bases");
+
+	/* Ensure we get all meshes calculated with a sufficient data-mask,
+	 * needed since re-evaluating single modifiers causes bugs if they depend
+	 * on other objects' data-masks too, see: T50950. */
 	{
+		for (CollectionPointerLink *link = selected_editable_bases.first; link; link = link->next) {
+			Base *base = link->ptr.data;
+			DAG_id_tag_update(&base->object->id, OB_RECALC_DATA);
+		}
+
+		uint64_t customdata_mask_prev = scene->customdata_mask;
+		scene->customdata_mask |= CD_MASK_MESH;
+		BKE_scene_update_tagged(bmain->eval_ctx, bmain, scene);
+		scene->customdata_mask = customdata_mask_prev;
+	}
+
+	for (CollectionPointerLink *link = selected_editable_bases.first; link; link = link->next) {
+		Base *base = link->ptr.data;
 		ob = base->object;
 
 		if (ob->flag & OB_DONE || !IS_TAGGED(ob->data)) {
@@ -1699,7 +1717,7 @@ static int convert_exec(bContext *C, wmOperator *op)
 				ED_rigidbody_object_remove(bmain, scene, newob);
 			}
 		}
-		else if (ob->type == OB_MESH && ob->modifiers.first) { /* converting a mesh with no modifiers causes a segfault */
+		else if (ob->type == OB_MESH) {
 			ob->flag |= OB_DONE;
 
 			if (keep_original) {
@@ -1723,7 +1741,6 @@ static int convert_exec(bContext *C, wmOperator *op)
 			 * cases this doesnt give correct results (when MDEF is used for eg)
 			 */
 			dm = mesh_get_derived_final(scene, newob, CD_MASK_MESH);
-			// dm = mesh_create_derived_no_deform(ob1, NULL);  /* this was called original (instead of get_derived). man o man why! (ton) */
 
 			DM_to_mesh(dm, newob->data, newob, CD_MASK_MESH, true);
 
@@ -1888,7 +1905,7 @@ static int convert_exec(bContext *C, wmOperator *op)
 			((ID *)ob->data)->tag &= ~LIB_TAG_DOIT; /* flag not to convert this datablock again */
 		}
 	}
-	CTX_DATA_END;
+	BLI_freelistN(&selected_editable_bases);
 
 	if (!keep_original) {
 		if (mballConverted) {
diff --git a/source/blender/editors/object/object_bake_api.c b/source/blender/editors/object/object_bake_api.c
index fd95d6129ad..968081818a2 100644
--- a/source/blender/editors/object/object_bake_api.c
+++ b/source/blender/editors/object/object_bake_api.c
@@ -352,12 +352,17 @@ static bool is_noncolor_pass(ScenePassType pass_type)
 }
 
 /* if all is good tag image and return true */
-static bool bake_object_check(Object *ob, ReportList *reports)
+static bool bake_object_check(Scene *scene, Object *ob, ReportList *reports)
 {
 	Image *image;
 	void *lock;
 	int i;
 
+	if ((ob->lay & scene->lay) == 0) {
+		BKE_reportf(reports, RPT_ERROR, "Object \"%s\" is not on a scene layer", ob->id.name + 2);
+		return false;
+	}
+
 	if (ob->type != OB_MESH) {
 		BKE_reportf(reports, RPT_ERROR, "Object \"%s\" is not a mesh", ob->id.name + 2);
 		return false;
@@ -491,7 +496,7 @@ static bool bake_pass_filter_check(ScenePassType pass_type, const int pass_filte
 }
 
 /* before even getting in the bake function we check for some basic errors */
-static bool bake_objects_check(Main *bmain, Object *ob, ListBase *selected_objects,
+static bool bake_objects_check(Main *bmain, Scene *scene, Object *ob, ListBase *selected_objects,
                                ReportList *reports, const bool is_selected_to_active)
 {
 	CollectionPointerLink *link;
@@ -502,7 +507,7 @@ static bool bake_objects_check(Main *bmain, Object *ob, ListBase *selected_objec
 	if (is_selected_to_active) {
 		int tot_objects = 0;
 
-		if (!bake_object_check(ob, reports))
+		if (!bake_object_check(scene, ob, reports))
 			return false;
 
 		for (link = selected_objects->first; link; link = link->next) {
@@ -530,7 +535,7 @@ static bool bake_objects_check(Main *bmain, Object *ob, ListBase *selected_objec
 		}
 
 		for (link = selected_objects->first; link; link = link->next) {
-			if (!bake_object_check(link->ptr.data, reports))
+			if (!bake_object_check(scene, link->ptr.data, reports))
 				return false;
 		}
 	}
@@ -619,7 +624,7 @@ static Mesh *bake_mesh_new_from_object(Main *bmain, Scene *scene, Object *ob)
 		ED_object_editmode_load(ob);
 
 	Mesh *me = BKE_mesh_new_from_object(bmain, scene, ob, 1, 2, 0, 0);
-	BKE_mesh_split_faces(me);
+	BKE_mesh_split_faces(me, true);
 
 	return me;
 }
@@ -1179,7 +1184,7 @@ static int bake_exec(bContext *C, wmOperator *op)
 		goto finally;
 	}
 
-	if (!bake_objects_check(bkr.main, bkr.ob, &bkr.selected_objects, bkr.reports, bkr.is_selected_to_active)) {
+	if (!bake_objects_check(bkr.main, bkr.scene, bkr.ob, &bkr.selected_objects, bkr.reports, bkr.is_selected_to_active)) {
 		goto finally;
 	}
 
@@ -1237,7 +1242,7 @@ static void bake_startjob(void *bkv, short *UNUSED(stop), short *do_update, floa
 		return;
 	}
 
-	if (!bake_objects_check(bkr->main, bkr->ob, &bkr->selected_objects, bkr->reports, bkr->is_selected_to_active)) {
+	if (!bake_objects_check(bkr->main, bkr->scene, bkr->ob, &bkr->selected_objects, bkr->reports, bkr->is_selected_to_active)) {
 		bkr->result = OPERATOR_CANCELLED;
 		return;
 	}
diff --git a/source/blender/editors/object/object_intern.h b/source/blender/editors/object/object_intern.h
index 9710e4f843d..b8957bdedf9 100644
--- a/source/blender/editors/object/object_intern.h
+++ b/source/blender/editors/object/object_intern.h
@@ -186,6 +186,7 @@ void OBJECT_OT_skin_loose_mark_clear(struct wmOperatorType *ot);
 void OBJECT_OT_skin_radii_equalize(struct wmOperatorType *ot);
 void OBJECT_OT_skin_armature_create(struct wmOperatorType *ot);
 void OBJECT_OT_laplaciandeform_bind(struct wmOperatorType *ot);
+void OBJECT_OT_surfacedeform_bind(struct wmOperatorType *ot);
 
 /* object_constraint.c */
 void OBJECT_OT_constraint_add(struct wmOperatorType *ot);
diff --git a/source/blender/editors/object/object_modifier.c b/source/blender/editors/object/object_modifier.c
index 06f495fb9f1..06d306ded42 100644
--- a/source/blender/editors/object/object_modifier.c
+++ b/source/blender/editors/object/object_modifier.c
@@ -1484,7 +1484,6 @@ static int skin_root_mark_exec(bContext *C, wmOperator *UNUSED(op))
 	Object *ob = CTX_data_edit_object(C);
 	BMEditMesh *em = BKE_editmesh_from_object(ob);
 	BMesh *bm = em->bm;
-	const int cd_vert_skin_offset = CustomData_get_offset(&bm->vdata, CD_MVERT_SKIN);
 	BMVert *bm_vert;
 	BMIter bm_iter;
 	GSet *visited;
@@ -1493,6 +1492,8 @@ static int skin_root_mark_exec(bContext *C, wmOperator *UNUSED(op))
 
 	BKE_mesh_ensure_skin_customdata(ob->data);
 
+	const int cd_vert_skin_offset = CustomData_get_offset(&bm->vdata, CD_MVERT_SKIN);
+
 	BM_ITER_MESH (bm_vert, &bm_iter, bm, BM_VERTS_OF_MESH) {
 		if (BM_elem_flag_test(bm_vert, BM_ELEM_SELECT) &&
 		    BLI_gset_add(visited, bm_vert))
@@ -2294,3 +2295,56 @@ void OBJECT_OT_laplaciandeform_bind(wmOperatorType *ot)
 	ot->flag = OPTYPE_REGISTER | OPTYPE_UNDO | OPTYPE_INTERNAL;
 	edit_modifier_properties(ot);
 }
+
+/************************ sdef bind operator *********************/
+
+static int surfacedeform_bind_poll(bContext *C)
+{ /* Poll callback: defers to the generic modifier poll, passing the Surface Deform modifier RNA type. */
+	return edit_modifier_poll_generic(C, &RNA_SurfaceDeformModifier, 0); /* NOTE(review): meaning of the trailing 0 is defined by the helper, not visible here */
+}
+
+static int surfacedeform_bind_exec(bContext *C, wmOperator *op)
+{ /* Exec callback: toggles MOD_SDEF_BIND on a Surface Deform modifier of the active context object. */
+	Object *ob = ED_object_active_context(C);
+	SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)edit_modifier_property_get(op, ob, eModifierType_SurfaceDeform); /* presumably looked up via the operator's properties -- confirm in helper */
+
+	if (!smd) /* no matching modifier was returned: nothing to toggle */
+		return OPERATOR_CANCELLED;
+
+	if (smd->flags & MOD_SDEF_BIND) { /* flag already set: clear it */
+		smd->flags &= ~MOD_SDEF_BIND;
+	}
+	else if (smd->target) { /* flag is only set when a target is assigned */
+		smd->flags |= MOD_SDEF_BIND;
+	}
+
+	DAG_id_tag_update(&ob->id, OB_RECALC_DATA); /* also reached when the flag was left untouched (unset flag, no target) */
+	WM_event_add_notifier(C, NC_OBJECT | ND_MODIFIER, ob);
+
+	return OPERATOR_FINISHED;
+}
+
+static int surfacedeform_bind_invoke(bContext *C, wmOperator *op, const wmEvent *UNUSED(event))
+{ /* Invoke callback: the event is unused; runs exec only when the properties helper succeeds. */
+	if (edit_modifier_invoke_properties(C, op)) /* presumably fills in the operator's modifier property from context -- confirm in helper */
+		return surfacedeform_bind_exec(C, op);
+	else /* helper returned false: exec is not run */
+		return OPERATOR_CANCELLED;
+}
+
+void OBJECT_OT_surfacedeform_bind(wmOperatorType *ot)
+{ /* Fills in the operator type for the Surface Deform bind operator: identifiers, callbacks and flags. */
+	/* identifiers */
+	ot->name = "Surface Deform Bind";
+	ot->description = "Bind mesh to target in surface deform modifier";
+	ot->idname = "OBJECT_OT_surfacedeform_bind";
+
+	/* api callbacks */
+	ot->poll = surfacedeform_bind_poll;
+	ot->invoke = surfacedeform_bind_invoke;
+	ot->exec = surfacedeform_bind_exec;
+
+	/* flags */
+	ot->flag = OPTYPE_REGISTER | OPTYPE_UNDO | OPTYPE_INTERNAL;
+	edit_modifier_properties(ot); /* presumably registers the shared modifier-selection property -- confirm in helper */
+}
diff --git a/source/blender/editors/object/object_ops.c b/source/blender/editors/object/object_ops.c
index 7e7e1ef182c..5fe5a884354 100644
--- a/source/blender/editors/object/object_ops.c
+++ b/source/blender/editors/object/object_ops.c
@@ -255,6 +255,7 @@ void ED_operatortypes_object(void)
 
 	WM_operatortype_append(OBJECT_OT_data_transfer);
 	WM_operatortype_append(OBJECT_OT_datalayout_transfer);
+	WM_operatortype_append(OBJECT_OT_surfacedeform_bind);
 }
 
 void ED_operatormacros_object(void)
diff --git a/source/blender/editors/physics/physics_ops.c b/source/blender/editors/physics/physics_ops.c
index 0c907f19753..b1d708ebc07 100644
--- a/source/blender/editors/physics/physics_ops.c
+++ b/source/blender/editors/physics/physics_ops.c
@@ -138,13 +138,21 @@ static void keymap_particle(wmKeyConfig *keyconf)
 	RNA_boolean_set(kmi->ptr, "unselected", true);
 
 	/* Shift+LMB behavior first, so it has priority over KM_ANY item below. */
-	kmi = WM_keymap_verify_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0);
+	kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0);
 	RNA_boolean_set(kmi->ptr, "release_confirm", true);
 	RNA_boolean_set(kmi->ptr, "use_planar_constraint", true);
+	RNA_boolean_set(kmi->ptr, "use_accurate", false);
+
+	kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0);
+	RNA_boolean_set(kmi->ptr, "release_confirm", true);
+	RNA_boolean_set(kmi->ptr, "use_planar_constraint", false);
+	RNA_boolean_set(kmi->ptr, "use_accurate", true);
+
 	/* Using KM_ANY here to allow holding modifiers before starting to transform. */
 	kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_ANY, 0);
 	RNA_boolean_set(kmi->ptr, "release_confirm", true);
 	RNA_boolean_set(kmi->ptr, "use_planar_constraint", false);
+	RNA_boolean_set(kmi->ptr, "use_accurate", false);
 
 	WM_keymap_add_item(keymap, "PARTICLE_OT_brush_edit", LEFTMOUSE, KM_PRESS, 0, 0);
 	WM_keymap_add_item(keymap, "PARTICLE_OT_brush_edit", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0);
diff --git a/source/blender/editors/render/render_opengl.c b/source/blender/editors/render/render_opengl.c
index 9097432a251..1d0f433ba38 100644
--- a/source/blender/editors/render/render_opengl.c
+++ b/source/blender/editors/render/render_opengl.c
@@ -315,7 +315,7 @@ static void screen_opengl_render_doit(OGLRender *oglrender, RenderResult *rr)
 			RE_render_result_rect_from_ibuf(rr, &scene->r, out, oglrender->view_id);
 			IMB_freeImBuf(out);
 		}
-		else if (gpd){
+		else if (gpd) {
 			/* If there are no strips, Grease Pencil still needs a buffer to draw on */
 			ImBuf *out = IMB_allocImBuf(oglrender->sizex, oglrender->sizey, 32, IB_rect);
 			RE_render_result_rect_from_ibuf(rr, &scene->r, out, oglrender->view_id);
@@ -715,7 +715,6 @@ static bool screen_opengl_render_init(bContext *C, wmOperator *op)
 			oglrender->task_scheduler = task_scheduler;
 			oglrender->task_pool = BLI_task_pool_create_background(task_scheduler,
 			                                                       oglrender);
-			BLI_pool_set_num_threads(oglrender->task_pool, 1);
 		}
 		else {
 			oglrender->task_scheduler = NULL;
@@ -747,6 +746,23 @@ static void screen_opengl_render_end(bContext *C, OGLRender *oglrender)
 	int i;
 
 	if (oglrender->is_animation) {
+		/* Trickery part for movie output:
+		 *
+		 * We MUST write frames in an exact order, so we only let the background
+		 * thread work on that, and the main thread simply waits for that
+		 * thread to do all the dirty work.
+		 *
+		 * After this loop is done work_and_wait() will have nothing to do,
+		 * so we don't run into wrong order of frames written to the stream.
+		 */
+		if (BKE_imtype_is_movie(scene->r.im_format.imtype)) {
+			BLI_mutex_lock(&oglrender->task_mutex);
+			while (oglrender->num_scheduled_frames > 0) {
+				BLI_condition_wait(&oglrender->task_condition,
+				                   &oglrender->task_mutex);
+			}
+			BLI_mutex_unlock(&oglrender->task_mutex);
+		}
 		BLI_task_pool_work_and_wait(oglrender->task_pool);
 		BLI_task_pool_free(oglrender->task_pool);
 		/* Depending on various things we might or might not use global scheduler. */
@@ -886,14 +902,15 @@ static void write_result_func(TaskPool * __restrict pool,
 	 */
 	ReportList reports;
 	BKE_reports_init(&reports, oglrender->reports->flag & ~RPT_PRINT);
-	/* Do actual save logic here, depending on the file format. */
+	/* Do actual save logic here, depending on the file format.
+	 *
+	 * NOTE: We have to construct temporary scene with proper scene->r.cfra.
+	 * This is because underlying calls do not use r.cfra but use scene
+	 * for that.
+	 */
+	Scene tmp_scene = *scene;
+	tmp_scene.r.cfra = cfra;
 	if (is_movie) {
-		/* We have to construct temporary scene with proper scene->r.cfra.
-		 * This is because underlying calls do not use r.cfra but use scene
-		 * for that.
-		 */
-		Scene tmp_scene = *scene;
-		tmp_scene.r.cfra = cfra;
 		ok = RE_WriteRenderViewsMovie(&reports,
 		                              rr,
 		                              &tmp_scene,
@@ -917,8 +934,8 @@ static void write_result_func(TaskPool * __restrict pool,
 		                             true,
 		                             NULL);
 
-		BKE_render_result_stamp_info(scene, scene->camera, rr, false);
-		ok = RE_WriteRenderViewsImage(NULL, rr, scene, true, name);
+		BKE_render_result_stamp_info(&tmp_scene, tmp_scene.camera, rr, false);
+		ok = RE_WriteRenderViewsImage(NULL, rr, &tmp_scene, true, name);
 		if (!ok) {
 			BKE_reportf(&reports,
 			            RPT_ERROR,
diff --git a/source/blender/editors/render/render_preview.c b/source/blender/editors/render/render_preview.c
index 87c08dc6583..fa18b82507e 100644
--- a/source/blender/editors/render/render_preview.c
+++ b/source/blender/editors/render/render_preview.c
@@ -196,7 +196,7 @@ static Main *load_main_from_memory(const void *blend, int blend_size)
 	BlendFileData *bfd;
 
 	G.fileflags |= G_FILE_NO_UI;
-	bfd = BLO_read_from_memory(blend, blend_size, NULL);
+	bfd = BLO_read_from_memory(blend, blend_size, NULL, BLO_READ_SKIP_NONE);
 	if (bfd) {
 		bmain = bfd->main;
 
diff --git a/source/blender/editors/screen/area.c b/source/blender/editors/screen/area.c
index e6bb604d387..9cfaf3b4c1d 100644
--- a/source/blender/editors/screen/area.c
+++ b/source/blender/editors/screen/area.c
@@ -1749,7 +1749,7 @@ int ED_area_header_switchbutton(const bContext *C, uiBlock *block, int yco)
 
 	RNA_pointer_create(&(scr->id), &RNA_Area, sa, &areaptr);
 
-	uiDefButR(block, UI_BTYPE_MENU, 0, "", xco, yco, 1.5 * U.widget_unit, U.widget_unit,
+	uiDefButR(block, UI_BTYPE_MENU, 0, "", xco, yco, 1.6 * U.widget_unit, U.widget_unit,
 	          &areaptr, "type", 0, 0.0f, 0.0f, 0.0f, 0.0f, "");
 
 	return xco + 1.7 * U.widget_unit;
diff --git a/source/blender/editors/screen/screen_ops.c b/source/blender/editors/screen/screen_ops.c
index a7a0a240259..c2e094fc161 100644
--- a/source/blender/editors/screen/screen_ops.c
+++ b/source/blender/editors/screen/screen_ops.c
@@ -2808,7 +2808,7 @@ static int screen_area_options_invoke(bContext *C, wmOperator *op, const wmEvent
 	bScreen *sc = CTX_wm_screen(C);
 	uiPopupMenu *pup;
 	uiLayout *layout;
-	PointerRNA ptr1, ptr2;
+	PointerRNA ptr;
 	ScrEdge *actedge;
 	const int winsize_x = WM_window_pixels_x(win);
 	const int winsize_y = WM_window_pixels_y(win);
@@ -2820,22 +2820,17 @@ static int screen_area_options_invoke(bContext *C, wmOperator *op, const wmEvent
 	pup = UI_popup_menu_begin(C, RNA_struct_ui_name(op->type->srna), ICON_NONE);
 	layout = UI_popup_menu_layout(pup);
 	
-	WM_operator_properties_create(&ptr1, "SCREEN_OT_area_join");
-	
-	/* mouse cursor on edge, '4' can fail on wide edges... */
-	RNA_int_set(&ptr1, "min_x", event->x + 4);
-	RNA_int_set(&ptr1, "min_y", event->y + 4);
-	RNA_int_set(&ptr1, "max_x", event->x - 4);
-	RNA_int_set(&ptr1, "max_y", event->y - 4);
-	
-	WM_operator_properties_create(&ptr2, "SCREEN_OT_area_split");
-	
+	ptr = uiItemFullO(layout, "SCREEN_OT_area_split", NULL, ICON_NONE, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 	/* store initial mouse cursor position */
-	RNA_int_set(&ptr2, "mouse_x", event->x);
-	RNA_int_set(&ptr2, "mouse_y", event->y);
-	
-	uiItemFullO(layout, "SCREEN_OT_area_split", NULL, ICON_NONE, ptr2.data, WM_OP_INVOKE_DEFAULT, 0);
-	uiItemFullO(layout, "SCREEN_OT_area_join", NULL, ICON_NONE, ptr1.data, WM_OP_INVOKE_DEFAULT, 0);
+	RNA_int_set(&ptr, "mouse_x", event->x);
+	RNA_int_set(&ptr, "mouse_y", event->y);
+
+	ptr = uiItemFullO(layout, "SCREEN_OT_area_join", NULL, ICON_NONE, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
+	/* mouse cursor on edge, '4' can fail on wide edges... */
+	RNA_int_set(&ptr, "min_x", event->x + 4);
+	RNA_int_set(&ptr, "min_y", event->y + 4);
+	RNA_int_set(&ptr, "max_x", event->x - 4);
+	RNA_int_set(&ptr, "max_y", event->y - 4);
 	
 	UI_popup_menu_end(C, pup);
 	
@@ -2903,10 +2898,23 @@ static void SCREEN_OT_spacedata_cleanup(wmOperatorType *ot)
 
 static int repeat_last_exec(bContext *C, wmOperator *UNUSED(op))
 {
-	wmOperator *lastop = CTX_wm_manager(C)->operators.last;
-	
-	if (lastop)
+	wmWindowManager *wm = CTX_wm_manager(C);
+	wmOperator *lastop = wm->operators.last;
+
+	/* Seek last registered operator */
+	while (lastop) {
+		if (lastop->type->flag & OPTYPE_REGISTER) {
+			break;
+		}
+		else {
+			lastop = lastop->prev;
+		}
+	}
+
+	if (lastop) {
+		WM_operator_free_all_after(wm, lastop);
 		WM_operator_repeat(C, lastop);
+	}
 	
 	return OPERATOR_CANCELLED;
 }
@@ -2941,8 +2949,9 @@ static int repeat_history_invoke(bContext *C, wmOperator *op, const wmEvent *UNU
 	layout = UI_popup_menu_layout(pup);
 	
 	for (i = items - 1, lastop = wm->operators.last; lastop; lastop = lastop->prev, i--)
-		if (WM_operator_repeat_check(C, lastop))
+		if ((lastop->type->flag & OPTYPE_REGISTER) && WM_operator_repeat_check(C, lastop)) {
 			uiItemIntO(layout, RNA_struct_ui_name(lastop->type->srna), ICON_NONE, op->type->idname, "index", i);
+		}
 	
 	UI_popup_menu_end(C, pup);
 	
diff --git a/source/blender/editors/sculpt_paint/paint_image_proj.c b/source/blender/editors/sculpt_paint/paint_image_proj.c
index f5d115442c6..d0f1cc99b8d 100644
--- a/source/blender/editors/sculpt_paint/paint_image_proj.c
+++ b/source/blender/editors/sculpt_paint/paint_image_proj.c
@@ -5711,21 +5711,16 @@ static bool proj_paint_add_slot(bContext *C, wmOperator *op)
 			/* successful creation of mtex layer, now create set */
 			if (mtex) {
 				int type = MAP_COL;
-				int type_id = 0;
+				char imagename_buff[MAX_ID_NAME - 2];
+				const char *imagename = DATA_("Diffuse Color");
 
 				if (op) {
-					int i;
 					type = RNA_enum_get(op->ptr, "type");
-
-					for (i = 0; i < ARRAY_SIZE(layer_type_items); i++) {
-						if (layer_type_items[i].value == type) {
-							type_id = i;
-							break;
-						}
-					}
+					RNA_string_get(op->ptr, "name", imagename_buff);
+					imagename = imagename_buff;
 				}
 
-				mtex->tex = BKE_texture_add(bmain, DATA_(layer_type_items[type_id].name));
+				mtex->tex = BKE_texture_add(bmain, imagename);
 				mtex->mapto = type;
 
 				if (mtex->tex) {
diff --git a/source/blender/editors/sculpt_paint/sculpt.c b/source/blender/editors/sculpt_paint/sculpt.c
index 84e98181dfb..44cc2720a32 100644
--- a/source/blender/editors/sculpt_paint/sculpt.c
+++ b/source/blender/editors/sculpt_paint/sculpt.c
@@ -5361,8 +5361,12 @@ static int sculpt_mode_toggle_exec(bContext *C, wmOperator *op)
 		if (mmd)
 			multires_force_update(ob);
 
-		if (flush_recalc || (ob->sculpt && ob->sculpt->bm))
+		/* Always tag for now, so that leaving sculpt mode always leaves the
+		 * scene in a consistent state.
+		 */
+		if (true || flush_recalc || (ob->sculpt && ob->sculpt->bm)) {
 			DAG_id_tag_update(&ob->id, OB_RECALC_DATA);
+		}
 
 		if (me->flag & ME_SCULPT_DYNAMIC_TOPOLOGY) {
 			/* Dynamic topology must be disabled before exiting sculpt
diff --git a/source/blender/editors/space_clip/tracking_ops.c b/source/blender/editors/space_clip/tracking_ops.c
index d28cbe5fb1d..169eb76399b 100644
--- a/source/blender/editors/space_clip/tracking_ops.c
+++ b/source/blender/editors/space_clip/tracking_ops.c
@@ -1534,7 +1534,8 @@ static int join_tracks_exec(bContext *C, wmOperator *op)
 				update_stabilization = true;
 				if ((act_track->flag & TRACK_USE_2D_STAB) == 0) {
 					act_track->flag |= TRACK_USE_2D_STAB;
-				} else {
+				}
+				else {
 					stab->tot_track--;
 				}
 				BLI_assert(0 <= stab->tot_track);
@@ -1543,7 +1544,8 @@ static int join_tracks_exec(bContext *C, wmOperator *op)
 				update_stabilization = true;
 				if ((act_track->flag & TRACK_USE_2D_STAB_ROT) == 0) {
 					act_track->flag |= TRACK_USE_2D_STAB_ROT;
-				} else {
+				}
+				else {
 					stab->tot_rot_track--;
 				}
 				BLI_assert(0 <= stab->tot_rot_track);
diff --git a/source/blender/editors/space_file/fsmenu.c b/source/blender/editors/space_file/fsmenu.c
index 631ff06a77a..71d49e0dc2e 100644
--- a/source/blender/editors/space_file/fsmenu.c
+++ b/source/blender/editors/space_file/fsmenu.c
@@ -169,12 +169,15 @@ void ED_fsmenu_entry_set_path(struct FSMenuEntry *fsentry, const char *path)
 
 static void fsmenu_entry_generate_name(struct FSMenuEntry *fsentry, char *name, size_t name_size)
 {
-	char temp[FILE_MAX];
+	int offset = 0;
+	int len = name_size;
 
-	BLI_strncpy(temp, fsentry->path, FILE_MAX);
-	BLI_add_slash(temp);
-	BLI_getlastdir(temp, name, name_size);
-	BLI_del_slash(name);
+	if (BLI_path_name_at_index(fsentry->path, -1, &offset, &len)) {
+		/* use as size */
+		len += 1;
+	}
+
+	BLI_strncpy(name, &fsentry->path[offset], MIN2(len, name_size));
 	if (!name[0]) {
 		name[0] = '/';
 		name[1] = '\0';
@@ -537,28 +540,22 @@ void fsmenu_read_system(struct FSMenu *fsmenu, int read_bookmarks)
 		/* Finally get user favorite places */
 		if (read_bookmarks) {
 			UInt32 seed;
-			OSErr err = noErr;
-			CFArrayRef pathesArray;
-			LSSharedFileListRef list;
-			LSSharedFileListItemRef itemRef;
-			CFIndex i, pathesCount;
-			CFURLRef cfURL = NULL;
-			CFStringRef pathString = NULL;
-			list = LSSharedFileListCreate(NULL, kLSSharedFileListFavoriteItems, NULL);
-			pathesArray = LSSharedFileListCopySnapshot(list, &seed);
-			pathesCount = CFArrayGetCount(pathesArray);
+			LSSharedFileListRef list = LSSharedFileListCreate(NULL, kLSSharedFileListFavoriteItems, NULL);
+			CFArrayRef pathesArray = LSSharedFileListCopySnapshot(list, &seed);
+			CFIndex pathesCount = CFArrayGetCount(pathesArray);
 			
-			for (i = 0; i < pathesCount; i++) {
-				itemRef = (LSSharedFileListItemRef)CFArrayGetValueAtIndex(pathesArray, i);
+			for (CFIndex i = 0; i < pathesCount; i++) {
+				LSSharedFileListItemRef itemRef = (LSSharedFileListItemRef)CFArrayGetValueAtIndex(pathesArray, i);
 				
-				err = LSSharedFileListItemResolve(itemRef, 
-				                                  kLSSharedFileListNoUserInteraction |
-				                                  kLSSharedFileListDoNotMountVolumes,
-				                                  &cfURL, NULL);
-				if (err != noErr)
+				CFURLRef cfURL = NULL;
+				OSErr err = LSSharedFileListItemResolve(itemRef, 
+				                                        kLSSharedFileListNoUserInteraction |
+				                                        kLSSharedFileListDoNotMountVolumes,
+				                                        &cfURL, NULL);
+				if (err != noErr || !cfURL)
 					continue;
 				
-				pathString = CFURLCopyFileSystemPath(cfURL, kCFURLPOSIXPathStyle);
+				CFStringRef pathString = CFURLCopyFileSystemPath(cfURL, kCFURLPOSIXPathStyle);
 				
 				if (pathString == NULL || !CFStringGetCString(pathString, line, sizeof(line), kCFStringEncodingUTF8))
 					continue;
diff --git a/source/blender/editors/space_graph/graph_buttons.c b/source/blender/editors/space_graph/graph_buttons.c
index 516814b63b4..cbb8e98e7e0 100644
--- a/source/blender/editors/space_graph/graph_buttons.c
+++ b/source/blender/editors/space_graph/graph_buttons.c
@@ -616,30 +616,31 @@ static void graph_panel_driverVar__rotDiff(uiLayout *layout, ID *id, DriverVar *
 	Object *ob2 = (Object *)dtar2->id;
 	PointerRNA dtar_ptr, dtar2_ptr;
 	uiLayout *col;
-	
+
 	/* initialize RNA pointer to the target */
-	RNA_pointer_create(id, &RNA_DriverTarget, dtar, &dtar_ptr); 
-	RNA_pointer_create(id, &RNA_DriverTarget, dtar2, &dtar2_ptr); 
-	
-	/* Bone 1 */
+	RNA_pointer_create(id, &RNA_DriverTarget, dtar, &dtar_ptr);
+	RNA_pointer_create(id, &RNA_DriverTarget, dtar2, &dtar2_ptr);
+
+	/* Object 1 */
 	col = uiLayoutColumn(layout, true);
 	uiLayoutSetRedAlert(col, (dtar->flag & DTAR_FLAG_INVALID)); /* XXX: per field... */
-	uiItemR(col, &dtar_ptr, "id", 0, IFACE_("Bone 1"), ICON_NONE);
-	
+	uiItemR(col, &dtar_ptr, "id", 0, IFACE_("Object 1"), ICON_NONE);
+
 	if (dtar->id && GS(dtar->id->name) == ID_OB && ob1->pose) {
 		PointerRNA tar_ptr;
-		
+
 		RNA_pointer_create(dtar->id, &RNA_Pose, ob1->pose, &tar_ptr);
 		uiItemPointerR(col, &dtar_ptr, "bone_target", &tar_ptr, "bones", "", ICON_BONE_DATA);
 	}
-	
+
+	/* Object 2 */
 	col = uiLayoutColumn(layout, true);
 	uiLayoutSetRedAlert(col, (dtar2->flag & DTAR_FLAG_INVALID)); /* XXX: per field... */
-	uiItemR(col, &dtar2_ptr, "id", 0, IFACE_("Bone 2"), ICON_NONE);
-		
+	uiItemR(col, &dtar2_ptr, "id", 0, IFACE_("Object 2"), ICON_NONE);
+
 	if (dtar2->id && GS(dtar2->id->name) == ID_OB && ob2->pose) {
 		PointerRNA tar_ptr;
-		
+
 		RNA_pointer_create(dtar2->id, &RNA_Pose, ob2->pose, &tar_ptr);
 		uiItemPointerR(col, &dtar2_ptr, "bone_target", &tar_ptr, "bones", "", ICON_BONE_DATA);
 	}
@@ -658,8 +659,8 @@ static void graph_panel_driverVar__locDiff(uiLayout *layout, ID *id, DriverVar *
 	/* initialize RNA pointer to the target */
 	RNA_pointer_create(id, &RNA_DriverTarget, dtar,  &dtar_ptr); 
 	RNA_pointer_create(id, &RNA_DriverTarget, dtar2, &dtar2_ptr); 
-	
-	/* Bone 1 */
+
+	/* Object 1 */
 	col = uiLayoutColumn(layout, true);
 	uiLayoutSetRedAlert(col, (dtar->flag & DTAR_FLAG_INVALID)); /* XXX: per field... */
 	uiItemR(col, &dtar_ptr, "id", 0, IFACE_("Object 1"), ICON_NONE);
@@ -673,7 +674,8 @@ static void graph_panel_driverVar__locDiff(uiLayout *layout, ID *id, DriverVar *
 	
 	uiLayoutSetRedAlert(col, false); /* we can clear it again now - it's only needed when creating the ID/Bone fields */
 	uiItemR(col, &dtar_ptr, "transform_space", 0, NULL, ICON_NONE);
-	
+
+	/* Object 2 */
 	col = uiLayoutColumn(layout, true);
 	uiLayoutSetRedAlert(col, (dtar2->flag & DTAR_FLAG_INVALID)); /* XXX: per field... */
 	uiItemR(col, &dtar2_ptr, "id", 0, IFACE_("Object 2"), ICON_NONE);
diff --git a/source/blender/editors/space_nla/nla_draw.c b/source/blender/editors/space_nla/nla_draw.c
index 5b3c062e16d..93dcdbb5c02 100644
--- a/source/blender/editors/space_nla/nla_draw.c
+++ b/source/blender/editors/space_nla/nla_draw.c
@@ -290,7 +290,8 @@ static void nla_draw_strip_curves(NlaStrip *strip, float yminc, float ymaxc)
 		 *	- min y-val is yminc, max is y-maxc, so clamp in those regions
 		 */
 		for (cfra = strip->start; cfra <= strip->end; cfra += 1.0f) {
-			float y = evaluate_fcurve(fcu, cfra);    // assume this to be in 0-1 range
+			float y = evaluate_fcurve(fcu, cfra);
+			CLAMP(y, 0.0f, 1.0f);
 			glVertex2f(cfra, ((y * yheight) + yminc));
 		}
 		glEnd(); // GL_LINE_STRIP
diff --git a/source/blender/editors/space_node/drawnode.c b/source/blender/editors/space_node/drawnode.c
index a6ee80d6feb..8dba0e39524 100644
--- a/source/blender/editors/space_node/drawnode.c
+++ b/source/blender/editors/space_node/drawnode.c
@@ -1321,9 +1321,6 @@ static void node_composit_buts_renderlayers(uiLayout *layout, bContext *C, Point
 	PropertyRNA *prop;
 	const char *layer_name;
 	char scene_name[MAX_ID_NAME - 2];
-	wmOperatorType *ot = WM_operatortype_find("RENDER_OT_render", 1);
-
-	BLI_assert(ot != 0);
 
 	uiTemplateID(layout, C, ptr, "scene", NULL, NULL, NULL);
 	
@@ -1340,11 +1337,9 @@ static void node_composit_buts_renderlayers(uiLayout *layout, bContext *C, Point
 	scn_ptr = RNA_pointer_get(ptr, "scene");
 	RNA_string_get(&scn_ptr, "name", scene_name);
 
-	WM_operator_properties_create_ptr(&op_ptr, ot);
+	op_ptr = uiItemFullO(row, "RENDER_OT_render", "", ICON_RENDER_STILL, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 	RNA_string_set(&op_ptr, "layer", layer_name);
 	RNA_string_set(&op_ptr, "scene", scene_name);
-	uiItemFullO_ptr(row, ot, "", ICON_RENDER_STILL, op_ptr.data, WM_OP_INVOKE_DEFAULT, 0);
-
 }
 
 
@@ -1793,6 +1788,7 @@ static void node_composit_buts_file_output_ex(uiLayout *layout, bContext *C, Poi
 	Scene *scene = CTX_data_scene(C);
 	PointerRNA imfptr = RNA_pointer_get(ptr, "format");
 	PointerRNA active_input_ptr, op_ptr;
+	wmOperatorType *ot;
 	uiLayout *row, *col;
 	int active_index;
 	const bool multilayer = RNA_enum_get(&imfptr, "file_format") == R_IMF_IMTYPE_MULTILAYER;
@@ -1831,11 +1827,10 @@ static void node_composit_buts_file_output_ex(uiLayout *layout, bContext *C, Poi
 	active_input_ptr.id.data = ptr->id.data;
 	
 	col = uiLayoutColumn(row, true);
-	op_ptr = uiItemFullO(col, "NODE_OT_output_file_move_active_socket", "",
-	                     ICON_TRIA_UP, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
+	ot = WM_operatortype_find("NODE_OT_output_file_move_active_socket", false);
+	op_ptr = uiItemFullO_ptr(col, ot, "", ICON_TRIA_UP, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 	RNA_enum_set(&op_ptr, "direction", 1);
-	op_ptr = uiItemFullO(col, "NODE_OT_output_file_move_active_socket", "",
-	                     ICON_TRIA_DOWN, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
+	op_ptr = uiItemFullO_ptr(col, ot, "", ICON_TRIA_DOWN, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 	RNA_enum_set(&op_ptr, "direction", 2);
 	
 	if (active_input_ptr.data) {
@@ -2167,14 +2162,7 @@ static void node_composit_buts_switch(uiLayout *layout, bContext *UNUSED(C), Poi
 
 static void node_composit_buts_switch_view_ex(uiLayout *layout, bContext *UNUSED(C), PointerRNA *UNUSED(ptr))
 {
-	PointerRNA op_ptr;
-	wmOperatorType *ot = WM_operatortype_find("NODE_OT_switch_view_update", 1);
-
-	BLI_assert(ot != 0);
-
-	WM_operator_properties_create_ptr(&op_ptr, ot);
-
-	uiItemFullO_ptr(layout, ot, "Update Views", ICON_FILE_REFRESH, op_ptr.data, WM_OP_INVOKE_DEFAULT, 0);
+	uiItemFullO(layout, "NODE_OT_switch_view_update", "Update Views", ICON_FILE_REFRESH, NULL, WM_OP_INVOKE_DEFAULT, 0);
 }
 
 static void node_composit_buts_boxmask(uiLayout *layout, bContext *UNUSED(C), PointerRNA *ptr)
diff --git a/source/blender/editors/space_node/node_add.c b/source/blender/editors/space_node/node_add.c
index d49df2afbd4..bcd0f6623e1 100644
--- a/source/blender/editors/space_node/node_add.c
+++ b/source/blender/editors/space_node/node_add.c
@@ -44,6 +44,8 @@
 #include "BKE_main.h"
 #include "BKE_node.h"
 #include "BKE_report.h"
+#include "BKE_scene.h"
+#include "BKE_texture.h"
 
 #include "ED_node.h"  /* own include */
 #include "ED_screen.h"
@@ -312,7 +314,10 @@ static int node_add_file_exec(bContext *C, wmOperator *op)
 
 	switch (snode->nodetree->type) {
 		case NTREE_SHADER:
-			type = SH_NODE_TEX_IMAGE;
+			if (BKE_scene_use_new_shading_nodes(CTX_data_scene(C)))
+				type = SH_NODE_TEX_IMAGE;
+			else
+				type = SH_NODE_TEXTURE;
 			break;
 		case NTREE_TEXTURE:
 			type = TEX_NODE_IMAGE;
@@ -333,7 +338,14 @@ static int node_add_file_exec(bContext *C, wmOperator *op)
 		return OPERATOR_CANCELLED;
 	}
 	
-	node->id = (ID *)ima;
+	if (type == SH_NODE_TEXTURE) {
+		Tex *tex = BKE_texture_add(CTX_data_main(C), DATA_(ima->id.name));
+		tex->ima = ima;
+		node->id = (ID *)tex;
+		WM_event_add_notifier(C, NC_TEXTURE | NA_ADDED, node->id);
+	}
+	else
+		node->id = (ID *)ima;
 
 	/* When adding new image file via drag-drop we need to load imbuf in order
 	 * to get proper image source.
diff --git a/source/blender/editors/space_node/node_buttons.c b/source/blender/editors/space_node/node_buttons.c
index f0567924edd..925298451ce 100644
--- a/source/blender/editors/space_node/node_buttons.c
+++ b/source/blender/editors/space_node/node_buttons.c
@@ -133,6 +133,7 @@ static void node_tree_interface_panel(const bContext *C, Panel *pa)
 	int in_out;
 	uiLayout *layout = pa->layout, *row, *split, *col;
 	PointerRNA ptr, sockptr, opptr;
+	wmOperatorType *ot;
 
 	if (!ntree)
 		return;
@@ -146,23 +147,25 @@ static void node_tree_interface_panel(const bContext *C, Panel *pa)
 	
 	split = uiLayoutRow(row, true);
 	col = uiLayoutColumn(split, true);
+	ot = WM_operatortype_find("NODE_OT_tree_socket_add", false);
 	uiItemL(col, IFACE_("Inputs:"), ICON_NONE);
 	uiTemplateList(col, (bContext *)C, "NODE_UL_interface_sockets", "inputs", &ptr, "inputs", &ptr, "active_input",
 	               NULL, 0, 0, 0, 0);
-	opptr = uiItemFullO(col, "NODE_OT_tree_socket_add", "", ICON_PLUS, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
+	opptr = uiItemFullO_ptr(col, ot, "", ICON_PLUS, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 	RNA_enum_set(&opptr, "in_out", SOCK_IN);
 	
 	col = uiLayoutColumn(split, true);
 	uiItemL(col, IFACE_("Outputs:"), ICON_NONE);
 	uiTemplateList(col, (bContext *)C, "NODE_UL_interface_sockets", "outputs", &ptr, "outputs", &ptr, "active_output",
 	               NULL, 0, 0, 0, 0);
-	opptr = uiItemFullO(col, "NODE_OT_tree_socket_add", "", ICON_PLUS, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
+	opptr = uiItemFullO_ptr(col, ot, "", ICON_PLUS, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 	RNA_enum_set(&opptr, "in_out", SOCK_OUT);
 	
+	ot = WM_operatortype_find("NODE_OT_tree_socket_move", false);
 	col = uiLayoutColumn(row, true);
-	opptr = uiItemFullO(col, "NODE_OT_tree_socket_move", "", ICON_TRIA_UP, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
+	opptr = uiItemFullO_ptr(col, ot, "", ICON_TRIA_UP, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 	RNA_enum_set(&opptr, "direction", 1);
-	opptr = uiItemFullO(col, "NODE_OT_tree_socket_move", "", ICON_TRIA_DOWN, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
+	opptr = uiItemFullO_ptr(col, ot, "", ICON_TRIA_DOWN, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 	RNA_enum_set(&opptr, "direction", 2);
 	
 	if (sock) {
diff --git a/source/blender/editors/space_node/node_edit.c b/source/blender/editors/space_node/node_edit.c
index ffe510016ff..fdfe316f5ed 100644
--- a/source/blender/editors/space_node/node_edit.c
+++ b/source/blender/editors/space_node/node_edit.c
@@ -582,7 +582,7 @@ void snode_set_context(const bContext *C)
 		}
 	}
 	
-	if (snode->nodetree != ntree || snode->id != id || snode->from != from) {
+	if (snode->nodetree != ntree || snode->id != id || snode->from != from || snode->treepath.last == NULL) {
 		ED_node_tree_start(snode, ntree, id, from);
 	}
 	
@@ -1069,12 +1069,9 @@ int node_find_indicated_socket(SpaceNode *snode, bNode **nodep, bNodeSocket **so
 	
 	/* check if we click in a socket */
 	for (node = snode->edittree->nodes.first; node; node = node->next) {
-		
-		rect.xmin = cursor[0] - (NODE_SOCKSIZE + 4);
-		rect.ymin = cursor[1] - (NODE_SOCKSIZE + 4);
-		rect.xmax = cursor[0] + (NODE_SOCKSIZE + 4);
-		rect.ymax = cursor[1] + (NODE_SOCKSIZE + 4);
-		
+
+		BLI_rctf_init_pt_radius(&rect, cursor, NODE_SOCKSIZE + 4);
+
 		if (!(node->flag & NODE_HIDDEN)) {
 			/* extra padding inside and out - allow dragging on the text areas too */
 			if (in_out == SOCK_IN) {
diff --git a/source/blender/editors/space_outliner/outliner_draw.c b/source/blender/editors/space_outliner/outliner_draw.c
index 99242fd12f9..121fe812016 100644
--- a/source/blender/editors/space_outliner/outliner_draw.c
+++ b/source/blender/editors/space_outliner/outliner_draw.c
@@ -1040,9 +1040,6 @@ static void tselem_draw_icon(uiBlock *block, int xmax, float x, float y, TreeSto
 	struct DrawIconArg arg;
 	float aspect;
 	
-	/* icons tiny bit away from text */
-	x -= 0.15f * UI_UNIT_Y;
-	
 	/* make function calls a bit compacter */
 	arg.block = block;
 	arg.id = tselem->id;
@@ -1053,8 +1050,10 @@ static void tselem_draw_icon(uiBlock *block, int xmax, float x, float y, TreeSto
 	
 	/* placement of icons, copied from interface_widgets.c */
 	aspect = (0.8f * UI_UNIT_Y) / ICON_DEFAULT_HEIGHT;
-	arg.x = x = x + 4.0f * aspect;
-	arg.y = y = y + 0.1f * UI_UNIT_Y;
+	x += 2.0f * aspect;
+	y += 2.0f * aspect;
+	arg.x = x;
+	arg.y = y;
 
 	if (tselem->type) {
 		switch (tselem->type) {
@@ -1126,6 +1125,7 @@ static void tselem_draw_icon(uiBlock *block, int xmax, float x, float y, TreeSto
 					case eModifierType_Cast:
 						UI_icon_draw(x, y, ICON_MOD_CAST); break;
 					case eModifierType_MeshDeform:
+					case eModifierType_SurfaceDeform:
 						UI_icon_draw(x, y, ICON_MOD_MESHDEFORM); break;
 					case eModifierType_Bevel:
 						UI_icon_draw(x, y, ICON_MOD_BEVEL); break;
@@ -1388,9 +1388,9 @@ static void outliner_draw_iconrow(bContext *C, uiBlock *block, Scene *scene, Spa
 				UI_draw_roundbox_corner_set(UI_CNR_ALL);
 				glColor4ub(255, 255, 255, 100);
 				UI_draw_roundbox(
-				        (float) *offsx - 1.0f * ufac,
+				        (float) *offsx + 1.0f * ufac,
 				        (float)ys + 1.0f * ufac,
-				        (float)*offsx + UI_UNIT_X - 2.0f * ufac,
+				        (float)*offsx + UI_UNIT_X - 1.0f * ufac,
 				        (float)ys + UI_UNIT_Y - ufac,
 				        (float)UI_UNIT_Y / 2.0f - ufac);
 				glEnable(GL_BLEND); /* roundbox disables */
@@ -1535,9 +1535,9 @@ static void outliner_draw_tree_element(
 		if (active != OL_DRAWSEL_NONE) {
 			UI_draw_roundbox_corner_set(UI_CNR_ALL);
 			UI_draw_roundbox(
-			        (float)startx + UI_UNIT_X,
+			        (float)startx + UI_UNIT_X + 1.0f * ufac,
 			        (float)*starty + 1.0f * ufac,
-			        (float)startx + 2.0f * UI_UNIT_X - 2.0f * ufac,
+			        (float)startx + 2.0f * UI_UNIT_X - 1.0f * ufac,
 			        (float)*starty + UI_UNIT_Y - 1.0f * ufac,
 			        UI_UNIT_Y / 2.0f - 1.0f * ufac);
 			glEnable(GL_BLEND); /* roundbox disables it */
@@ -1548,16 +1548,13 @@ static void outliner_draw_tree_element(
 		/* open/close icon, only when sublevels, except for scene */
 		if (te->subtree.first || (tselem->type == 0 && te->idcode == ID_SCE) || (te->flag & TE_LAZY_CLOSED)) {
 			int icon_x;
-			if (tselem->type == 0 && ELEM(te->idcode, ID_OB, ID_SCE))
-				icon_x = startx;
-			else
-				icon_x = startx + 5 * ufac;
+			icon_x = startx;
 			
 			// icons a bit higher
 			if (TSELEM_OPEN(tselem, soops))
-				UI_icon_draw((float)icon_x, (float)*starty + 2 * ufac, ICON_DISCLOSURE_TRI_DOWN);
+				UI_icon_draw((float)icon_x + 2 * ufac, (float)*starty + 1 * ufac, ICON_DISCLOSURE_TRI_DOWN);
 			else
-				UI_icon_draw((float)icon_x, (float)*starty + 2 * ufac, ICON_DISCLOSURE_TRI_RIGHT);
+				UI_icon_draw((float)icon_x + 2 * ufac, (float)*starty + 1 * ufac, ICON_DISCLOSURE_TRI_RIGHT);
 		}
 		offsx += UI_UNIT_X;
 		
@@ -1567,7 +1564,7 @@ static void outliner_draw_tree_element(
 			
 			tselem_draw_icon(block, xmax, (float)startx + offsx, (float)*starty, tselem, te, 1.0f);
 			
-			offsx += UI_UNIT_X;
+			offsx += UI_UNIT_X + 2 * ufac;
 		}
 		else
 			offsx += 2 * ufac;
@@ -1575,16 +1572,16 @@ static void outliner_draw_tree_element(
 		if (tselem->type == 0 && ID_IS_LINKED_DATABLOCK(tselem->id)) {
 			glPixelTransferf(GL_ALPHA_SCALE, 0.5f);
 			if (tselem->id->tag & LIB_TAG_MISSING) {
-				UI_icon_draw((float)startx + offsx, (float)*starty + 2 * ufac, ICON_LIBRARY_DATA_BROKEN);
+				UI_icon_draw((float)startx + offsx + 2 * ufac, (float)*starty + 2 * ufac, ICON_LIBRARY_DATA_BROKEN);
 			}
 			else if (tselem->id->tag & LIB_TAG_INDIRECT) {
-				UI_icon_draw((float)startx + offsx, (float)*starty + 2 * ufac, ICON_LIBRARY_DATA_INDIRECT);
+				UI_icon_draw((float)startx + offsx + 2 * ufac, (float)*starty + 2 * ufac, ICON_LIBRARY_DATA_INDIRECT);
 			}
 			else {
-				UI_icon_draw((float)startx + offsx, (float)*starty + 2 * ufac, ICON_LIBRARY_DATA_DIRECT);
+				UI_icon_draw((float)startx + offsx + 2 * ufac, (float)*starty + 2 * ufac, ICON_LIBRARY_DATA_DIRECT);
 			}
 			glPixelTransferf(GL_ALPHA_SCALE, 1.0f);
-			offsx += UI_UNIT_X;
+			offsx += UI_UNIT_X + 2 * ufac;
 		}
 		glDisable(GL_BLEND);
 		
@@ -1756,7 +1753,7 @@ static void outliner_draw_tree(bContext *C, uiBlock *block, Scene *scene, ARegio
 	// gray hierarchy lines
 	UI_ThemeColorBlend(TH_BACK, TH_TEXT, 0.4f);
 	starty = (int)ar->v2d.tot.ymax - UI_UNIT_Y / 2 - OL_Y_OFFSET;
-	startx = 6;
+	startx = UI_UNIT_X / 2 - 1.0f;
 	outliner_draw_hierarchy(soops, &soops->tree, startx, &starty);
 	
 	// items themselves
diff --git a/source/blender/editors/space_outliner/outliner_edit.c b/source/blender/editors/space_outliner/outliner_edit.c
index 345ac353c11..335eb95da0e 100644
--- a/source/blender/editors/space_outliner/outliner_edit.c
+++ b/source/blender/editors/space_outliner/outliner_edit.c
@@ -309,9 +309,12 @@ static void id_delete(bContext *C, ReportList *reports, TreeElement *te, TreeSto
 	ID *id = tselem->id;
 
 	BLI_assert(te->idcode != 0 && id != NULL);
-	BLI_assert(te->idcode != ID_LI || ((Library *)id)->parent == NULL);
 	UNUSED_VARS_NDEBUG(te);
 
+	if (te->idcode == ID_LI && ((Library *)id)->parent != NULL) {
+		BKE_reportf(reports, RPT_WARNING, "Cannot delete indirectly linked library '%s'", id->name);
+		return;
+	}
 	if (id->tag & LIB_TAG_INDIRECT) {
 		BKE_reportf(reports, RPT_WARNING, "Cannot delete indirectly linked id '%s'", id->name);
 		return;
@@ -516,7 +519,8 @@ void OUTLINER_OT_id_remap(wmOperatorType *ot)
 
 	ot->flag = 0;
 
-	RNA_def_enum(ot->srna, "id_type", rna_enum_id_type_items, ID_OB, "ID Type", "");
+	prop = RNA_def_enum(ot->srna, "id_type", rna_enum_id_type_items, ID_OB, "ID Type", "");
+	RNA_def_property_translation_context(prop, BLT_I18NCONTEXT_ID_ID);
 
 	prop = RNA_def_enum(ot->srna, "old_id", DummyRNA_NULL_items, 0, "Old ID", "Old ID to replace");
 	RNA_def_property_enum_funcs_runtime(prop, NULL, NULL, outliner_id_itemf);
@@ -2068,74 +2072,62 @@ static int parent_drop_invoke(bContext *C, wmOperator *op, const wmEvent *event)
 			wmOperatorType *ot = WM_operatortype_find("OUTLINER_OT_parent_drop", false);
 			uiPopupMenu *pup = UI_popup_menu_begin(C, IFACE_("Set Parent To"), ICON_NONE);
 			uiLayout *layout = UI_popup_menu_layout(pup);
-			
 			PointerRNA ptr;
 			
-			WM_operator_properties_create_ptr(&ptr, ot);
+			/* Cannot use uiItemEnumO()... have multiple properties to set. */
+			ptr = uiItemFullO_ptr(layout, ot, IFACE_("Object"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 			RNA_string_set(&ptr, "parent", parname);
 			RNA_string_set(&ptr, "child", childname);
 			RNA_enum_set(&ptr, "type", PAR_OBJECT);
-			/* Cannot use uiItemEnumO()... have multiple properties to set. */
-			uiItemFullO_ptr(layout, ot, IFACE_("Object"), 0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
-			
+
 			/* par becomes parent, make the associated menus */
 			if (par->type == OB_ARMATURE) {
-				WM_operator_properties_create_ptr(&ptr, ot);
+				ptr = uiItemFullO_ptr(layout, ot, IFACE_("Armature Deform"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 				RNA_string_set(&ptr, "parent", parname);
 				RNA_string_set(&ptr, "child", childname);
 				RNA_enum_set(&ptr, "type", PAR_ARMATURE);
-				uiItemFullO_ptr(layout, ot, IFACE_("Armature Deform"), 0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
-				
-				WM_operator_properties_create_ptr(&ptr, ot);
+
+				ptr = uiItemFullO_ptr(layout, ot, IFACE_("   With Empty Groups"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 				RNA_string_set(&ptr, "parent", parname);
 				RNA_string_set(&ptr, "child", childname);
 				RNA_enum_set(&ptr, "type", PAR_ARMATURE_NAME);
-				uiItemFullO_ptr(layout, ot, IFACE_("   With Empty Groups"), 0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
-				
-				WM_operator_properties_create_ptr(&ptr, ot);
+
+				ptr = uiItemFullO_ptr(layout, ot, IFACE_("   With Envelope Weights"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 				RNA_string_set(&ptr, "parent", parname);
 				RNA_string_set(&ptr, "child", childname);
 				RNA_enum_set(&ptr, "type", PAR_ARMATURE_ENVELOPE);
-				uiItemFullO_ptr(layout, ot, IFACE_("   With Envelope Weights"), 0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
-				
-				WM_operator_properties_create_ptr(&ptr, ot);
+
+				ptr = uiItemFullO_ptr(layout, ot, IFACE_("   With Automatic Weights"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 				RNA_string_set(&ptr, "parent", parname);
 				RNA_string_set(&ptr, "child", childname);
 				RNA_enum_set(&ptr, "type", PAR_ARMATURE_AUTO);
-				uiItemFullO_ptr(layout, ot, IFACE_("   With Automatic Weights"), 0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
-				
-				WM_operator_properties_create_ptr(&ptr, ot);
+
+				ptr = uiItemFullO_ptr(layout, ot, IFACE_("Bone"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 				RNA_string_set(&ptr, "parent", parname);
 				RNA_string_set(&ptr, "child", childname);
 				RNA_enum_set(&ptr, "type", PAR_BONE);
-				uiItemFullO_ptr(layout, ot, IFACE_("Bone"),
-				            0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
 			}
 			else if (par->type == OB_CURVE) {
-				WM_operator_properties_create_ptr(&ptr, ot);
+				ptr = uiItemFullO_ptr(layout, ot, IFACE_("Curve Deform"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 				RNA_string_set(&ptr, "parent", parname);
 				RNA_string_set(&ptr, "child", childname);
 				RNA_enum_set(&ptr, "type", PAR_CURVE);
-				uiItemFullO_ptr(layout, ot, IFACE_("Curve Deform"), 0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
-				
-				WM_operator_properties_create_ptr(&ptr, ot);
+
+				ptr = uiItemFullO_ptr(layout, ot, IFACE_("Follow Path"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 				RNA_string_set(&ptr, "parent", parname);
 				RNA_string_set(&ptr, "child", childname);
 				RNA_enum_set(&ptr, "type", PAR_FOLLOW);
-				uiItemFullO_ptr(layout, ot, IFACE_("Follow Path"), 0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
-				
-				WM_operator_properties_create_ptr(&ptr, ot);
+
+				ptr = uiItemFullO_ptr(layout, ot, IFACE_("Path Constraint"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 				RNA_string_set(&ptr, "parent", parname);
 				RNA_string_set(&ptr, "child", childname);
 				RNA_enum_set(&ptr, "type", PAR_PATH_CONST);
-				uiItemFullO_ptr(layout, ot, IFACE_("Path Constraint"), 0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
 			}
 			else if (par->type == OB_LATTICE) {
-				WM_operator_properties_create_ptr(&ptr, ot);
+				ptr = uiItemFullO_ptr(layout, ot, IFACE_("Lattice Deform"), 0, NULL, WM_OP_EXEC_DEFAULT, UI_ITEM_O_RETURN_PROPS);
 				RNA_string_set(&ptr, "parent", parname);
 				RNA_string_set(&ptr, "child", childname);
 				RNA_enum_set(&ptr, "type", PAR_LATTICE);
-				uiItemFullO_ptr(layout, ot, IFACE_("Lattice Deform"), 0, ptr.data, WM_OP_EXEC_DEFAULT, 0);
 			}
 			
 			UI_popup_menu_end(C, pup);
diff --git a/source/blender/editors/space_outliner/outliner_tools.c b/source/blender/editors/space_outliner/outliner_tools.c
index 4eda7977622..086cc55366b 100644
--- a/source/blender/editors/space_outliner/outliner_tools.c
+++ b/source/blender/editors/space_outliner/outliner_tools.c
@@ -1254,6 +1254,7 @@ static int outliner_id_operation_exec(bContext *C, wmOperator *op)
 		{
 			if (idlevel > 0) {
 				outliner_do_libdata_operation(C, op->reports, scene, soops, &soops->tree, id_delete_cb, NULL);
+				ED_undo_push(C, "Delete");
 			}
 			break;
 		}
@@ -1261,6 +1262,7 @@ static int outliner_id_operation_exec(bContext *C, wmOperator *op)
 		{
 			if (idlevel > 0) {
 				outliner_do_libdata_operation(C, op->reports, scene, soops, &soops->tree, id_remap_cb, NULL);
+				ED_undo_push(C, "Remap");
 			}
 			break;
 		}
@@ -1369,18 +1371,20 @@ static int outliner_lib_operation_exec(bContext *C, wmOperator *op)
 			outliner_do_libdata_operation(C, op->reports, scene, soops, &soops->tree, item_rename_cb, NULL);
 
 			WM_event_add_notifier(C, NC_ID | NA_EDITED, NULL);
-			ED_undo_push(C, "Rename");
+			ED_undo_push(C, "Rename Library");
 			break;
 		}
 		case OL_LIB_DELETE:
 		{
 			outliner_do_libdata_operation(C, op->reports, scene, soops, &soops->tree, id_delete_cb, NULL);
+			ED_undo_push(C, "Delete Library");
 			break;
 		}
 		case OL_LIB_RELOCATE:
 		{
 			/* rename */
 			outliner_do_libdata_operation(C, op->reports, scene, soops, &soops->tree, lib_relocate_cb, NULL);
+			ED_undo_push(C, "Relocate Library");
 			break;
 		}
 		case OL_LIB_RELOAD:
diff --git a/source/blender/editors/space_sequencer/sequencer_add.c b/source/blender/editors/space_sequencer/sequencer_add.c
index ede6b7ce469..46f212e3679 100644
--- a/source/blender/editors/space_sequencer/sequencer_add.c
+++ b/source/blender/editors/space_sequencer/sequencer_add.c
@@ -38,6 +38,8 @@
 #include "BLI_math.h"
 #include "BLI_utildefines.h"
 
+#include "BLT_translation.h"
+
 #include "DNA_scene_types.h"
 #include "DNA_mask_types.h"
 
@@ -445,6 +447,7 @@ void SEQUENCER_OT_movieclip_strip_add(struct wmOperatorType *ot)
 	sequencer_generic_props__internal(ot, SEQPROP_STARTFRAME);
 	prop = RNA_def_enum(ot->srna, "clip", DummyRNA_NULL_items, 0, "Clip", "");
 	RNA_def_enum_funcs(prop, RNA_movieclip_itemf);
+	RNA_def_property_translation_context(prop, BLT_I18NCONTEXT_ID_MOVIECLIP);
 	RNA_def_property_flag(prop, PROP_ENUM_NO_TRANSLATE);
 	ot->prop = prop;
 }
diff --git a/source/blender/editors/space_sequencer/sequencer_draw.c b/source/blender/editors/space_sequencer/sequencer_draw.c
index e1768e4aedc..70a6e6d83cb 100644
--- a/source/blender/editors/space_sequencer/sequencer_draw.c
+++ b/source/blender/editors/space_sequencer/sequencer_draw.c
@@ -545,7 +545,8 @@ static void draw_seq_text(View2D *v2d, SpaceSeq *sseq, Sequence *seq, float x1,
 		if ((sseq->flag & SEQ_ALL_WAVEFORMS) || (seq->flag & SEQ_AUDIO_DRAW_WAVEFORM)) {
 			str[0] = 0;
 			str_len = 0;
-		} else if (seq->sound) {
+		}
+		else if (seq->sound) {
 			str_len = BLI_snprintf(str, sizeof(str), "%s: %s | %d",
 			                       name, seq->sound->name, seq->len);
 		}
diff --git a/source/blender/editors/space_text/text_draw.c b/source/blender/editors/space_text/text_draw.c
index 81605a80f69..70b5feac280 100644
--- a/source/blender/editors/space_text/text_draw.c
+++ b/source/blender/editors/space_text/text_draw.c
@@ -926,6 +926,7 @@ static void draw_textscroll(const SpaceText *st, rcti *scroll, rcti *back)
 
 /*********************** draw documentation *******************************/
 
+#if 0
 static void draw_documentation(const SpaceText *st, ARegion *ar)
 {
 	TextDrawContext tdc = {0};
@@ -1014,6 +1015,7 @@ static void draw_documentation(const SpaceText *st, ARegion *ar)
 		draw_documentation(st, ar);
 	}
 }
+#endif
 
 /*********************** draw suggestion list *******************************/
 
@@ -1487,7 +1489,7 @@ void draw_text_main(SpaceText *st, ARegion *ar)
 	/* draw other stuff */
 	draw_brackets(st, &tdc, ar);
 	draw_textscroll(st, &scroll, &back);
-	draw_documentation(st, ar);
+	/* draw_documentation(st, ar); - No longer supported */
 	draw_suggestion_list(st, &tdc, ar);
 	
 	text_font_end(&tdc);
diff --git a/source/blender/editors/space_view3d/drawarmature.c b/source/blender/editors/space_view3d/drawarmature.c
index 149144d969a..314b7bf3335 100644
--- a/source/blender/editors/space_view3d/drawarmature.c
+++ b/source/blender/editors/space_view3d/drawarmature.c
@@ -1265,7 +1265,7 @@ static void draw_b_bone(const short dt, int armflag, int boneflag, short constfl
 	else {
 		/* wire */
 		if (armflag & ARM_POSEMODE) {
-			if (constflag) {
+			if (constflag && ((G.f & G_PICKSEL) == 0)) {
 				/* set constraint colors */
 				if (set_pchan_glColor(PCHAN_COLOR_CONSTS, boneflag, constflag)) {
 					glEnable(GL_BLEND);
@@ -1406,7 +1406,7 @@ static void draw_bone(const short dt, int armflag, int boneflag, short constflag
 			set_ebone_glColor(boneflag);
 		}
 		else if (armflag & ARM_POSEMODE) {
-			if (constflag) {
+			if (constflag && ((G.f & G_PICKSEL) == 0)) {
 				/* draw constraint colors */
 				if (set_pchan_glColor(PCHAN_COLOR_CONSTS, boneflag, constflag)) {
 					glEnable(GL_BLEND);
@@ -2360,7 +2360,6 @@ static void draw_ebones(View3D *v3d, ARegion *ar, Object *ob, const short dt)
 						/*	Draw name */
 						if (arm->flag & ARM_DRAWNAMES) {
 							mid_v3_v3v3(vec, eBone->head, eBone->tail);
-							glRasterPos3fv(vec);
 							view3d_cached_text_draw_add(vec, eBone->name, strlen(eBone->name), 10, 0, col);
 						}
 						/*	Draw additional axes */
diff --git a/source/blender/editors/space_view3d/drawobject.c b/source/blender/editors/space_view3d/drawobject.c
index 90d33dc5995..be2e4ab05e0 100644
--- a/source/blender/editors/space_view3d/drawobject.c
+++ b/source/blender/editors/space_view3d/drawobject.c
@@ -7171,8 +7171,9 @@ static void drawtexspace(Object *ob)
 }
 
 /* draws wire outline */
-static void drawObjectSelect(Scene *scene, View3D *v3d, ARegion *ar, Base *base,
-                             const unsigned char ob_wire_col[4])
+static void draw_object_selected_outline(
+        Scene *scene, View3D *v3d, ARegion *ar, Base *base,
+        const unsigned char ob_wire_col[4])
 {
 	RegionView3D *rv3d = ar->regiondata;
 	Object *ob = base->object;
@@ -7645,7 +7646,7 @@ void draw_object(Scene *scene, ARegion *ar, View3D *v3d, Base *base, const short
 		if ((v3d->flag & V3D_SELECT_OUTLINE) && !render_override && ob->type != OB_MESH) {
 			if (dt > OB_WIRE && (ob->mode & OB_MODE_EDIT) == 0 && (dflag & DRAW_SCENESET) == 0) {
 				if (!(ob->dtx & OB_DRAWWIRE) && (ob->flag & SELECT) && !(dflag & (DRAW_PICKING | DRAW_CONSTCOLOR))) {
-					drawObjectSelect(scene, v3d, ar, base, ob_wire_col);
+					draw_object_selected_outline(scene, v3d, ar, base, ob_wire_col);
 				}
 			}
 		}
@@ -8174,6 +8175,50 @@ void draw_object(Scene *scene, ARegion *ar, View3D *v3d, Base *base, const short
 	ED_view3d_clear_mats_rv3d(rv3d);
 }
 
+
+/**
+ * Draw 'base' for selection picking and, when OB_DUPLI is set in its transflag, every dupli instance it generates.
+ * Caller must have called 'GPU_select_load_id(base->selcol)' first; no id is loaded in here, dupli instances included.
+ */
+void draw_object_select(Scene *scene, ARegion *ar, View3D *v3d, Base *base, const short dflag)
+{
+	BLI_assert(dflag & DRAW_PICKING && dflag & DRAW_CONSTCOLOR);  /* picking-only entry point: both flags are expected */
+	draw_object(scene, ar, v3d, base, dflag);
+
+	/* we draw duplicators for selection too, each instance through a temporary Base living on the stack */
+	if ((base->object->transflag & OB_DUPLI)) {
+		ListBase *lb;
+		DupliObject *dob;
+		Base tbase;
+
+		tbase.flag = OB_FROMDUPLI;  /* only 'flag' and 'object' of the temporary Base are ever assigned */
+		lb = object_duplilist(G.main->eval_ctx, scene, base->object);
+
+		for (dob = lb->first; dob; dob = dob->next) {
+			float omat[4][4];
+			char dt;
+			short dtx;
+
+			tbase.object = dob->ob;
+			copy_m4_m4(omat, dob->ob->obmat);  /* back up the object's own matrix */
+			copy_m4_m4(dob->ob->obmat, dob->mat);  /* draw with the dupli instance's matrix instead */
+
+			/* extra service: draw the duplicator in drawtype of parent */
+			/* MIN2 for the drawtype to allow bounding box objects in groups for lods */
+			dt = tbase.object->dt;   tbase.object->dt = MIN2(tbase.object->dt, base->object->dt);
+			dtx = tbase.object->dtx; tbase.object->dtx = base->object->dtx;
+
+			draw_object(scene, ar, v3d, &tbase, dflag);
+
+			tbase.object->dt = dt;  /* restore the draw type and extra draw flags overridden above */
+			tbase.object->dtx = dtx;
+
+			copy_m4_m4(dob->ob->obmat, omat);  /* restore the object's own matrix */
+		}
+		free_object_duplilist(lb);
+	}
+}
+
 /* ***************** BACKBUF SEL (BBS) ********* */
 
 static void bbs_obmode_mesh_verts__mapFunc(void *userData, int index, const float co[3],
diff --git a/source/blender/editors/space_view3d/drawvolume.c b/source/blender/editors/space_view3d/drawvolume.c
index 27ecbf83db5..c076bfb4aa4 100644
--- a/source/blender/editors/space_view3d/drawvolume.c
+++ b/source/blender/editors/space_view3d/drawvolume.c
@@ -737,7 +737,7 @@ static void add_streamline(float (*verts)[3], float(*colors)[3], float center[3]
 	copy_v3_v3(verts[(*offset)++], center);
 }
 
-typedef void (*vector_draw_func)(float(*)[3], float(*)[3], float*, float*, float, float, int*);
+typedef void (*vector_draw_func)(float(*)[3], float(*)[3], float *, float *, float, float, int *);
 #endif  /* WITH_SMOKE */
 
 void draw_smoke_velocity(SmokeDomainSettings *domain, float viewnormal[3])
@@ -774,8 +774,8 @@ void draw_smoke_velocity(SmokeDomainSettings *domain, float viewnormal[3])
 
 	float min[3] = {
 	    domain->p0[0] - domain->cell_size[0] * domain->adapt_res,
-		domain->p0[1] - domain->cell_size[1] * domain->adapt_res,
-		domain->p0[2] - domain->cell_size[2] * domain->adapt_res,
+	    domain->p0[1] - domain->cell_size[1] * domain->adapt_res,
+	    domain->p0[2] - domain->cell_size[2] * domain->adapt_res,
 	};
 
 	int num_points_v[3] = {
diff --git a/source/blender/editors/space_view3d/space_view3d.c b/source/blender/editors/space_view3d/space_view3d.c
index 075b1faf502..b8228c63209 100644
--- a/source/blender/editors/space_view3d/space_view3d.c
+++ b/source/blender/editors/space_view3d/space_view3d.c
@@ -180,8 +180,8 @@ bool ED_view3d_context_user_region(bContext *C, View3D **r_v3d, ARegion **r_ar)
 		View3D *v3d = (View3D *)sa->spacedata.first;
 
 		if (ar) {
-			RegionView3D *rv3d = ar->regiondata;
-			if (rv3d && (rv3d->viewlock & RV3D_LOCKED) == 0) {
+			RegionView3D *rv3d;
+			if ((ar->regiontype == RGN_TYPE_WINDOW) && (rv3d = ar->regiondata) && (rv3d->viewlock & RV3D_LOCKED) == 0) {
 				*r_v3d = v3d;
 				*r_ar = ar;
 				return true;
@@ -869,6 +869,7 @@ static void view3d_main_region_listener(bScreen *sc, ScrArea *sa, ARegion *ar, w
 				case ND_CONSTRAINT:
 				case ND_KEYS:
 				case ND_PARTICLE:
+				case ND_POINTCACHE:
 				case ND_LOD:
 					ED_region_tag_redraw(ar);
 					break;
@@ -1256,21 +1257,6 @@ static void space_view3d_listener(bScreen *UNUSED(sc), ScrArea *sa, struct wmNot
 			}
 			break;
 	}
-
-	/* removed since BKE_image_user_frame_calc is now called in view3d_draw_bgpic because screen_ops doesnt call the notifier. */
-#if 0
-	if (wmn->category == NC_SCENE && wmn->data == ND_FRAME) {
-		View3D *v3d = area->spacedata.first;
-		BGpic *bgpic = v3d->bgpicbase.first;
-
-		for (; bgpic; bgpic = bgpic->next) {
-			if (bgpic->ima) {
-				Scene *scene = wmn->reference;
-				BKE_image_user_frame_calc(&bgpic->iuser, scene->r.cfra, 0);
-			}
-		}
-	}
-#endif
 }
 
 const char *view3d_context_dir[] = {
diff --git a/source/blender/editors/space_view3d/view3d_buttons.c b/source/blender/editors/space_view3d/view3d_buttons.c
index 351c7ccec15..34e01405e7e 100644
--- a/source/blender/editors/space_view3d/view3d_buttons.c
+++ b/source/blender/editors/space_view3d/view3d_buttons.c
@@ -814,10 +814,6 @@ static void view3d_panel_vgroup(const bContext *C, Panel *pa)
 	if (dv && dv->totweight) {
 		ToolSettings *ts = scene->toolsettings;
 
-		wmOperatorType *ot_weight_set_active = WM_operatortype_find("OBJECT_OT_vertex_weight_set_active", true);
-		wmOperatorType *ot_weight_paste = WM_operatortype_find("OBJECT_OT_vertex_weight_paste", true);
-		wmOperatorType *ot_weight_delete = WM_operatortype_find("OBJECT_OT_vertex_weight_delete", true);
-
 		wmOperatorType *ot;
 		PointerRNA op_ptr, tools_ptr;
 		PointerRNA *but_ptr;
@@ -856,7 +852,7 @@ static void view3d_panel_vgroup(const bContext *C, Panel *pa)
 
 					/* The Weight Group Name */
 
-					ot = ot_weight_set_active;
+					ot = WM_operatortype_find("OBJECT_OT_vertex_weight_set_active", true);
 					but = uiDefButO_ptr(block, UI_BTYPE_BUT, ot, WM_OP_EXEC_DEFAULT, dg->name,
 					                    xco, yco, (x = UI_UNIT_X * 5), UI_UNIT_Y, "");
 					but_ptr = UI_but_operator_ptr_get(but);
@@ -882,23 +878,16 @@ static void view3d_panel_vgroup(const bContext *C, Panel *pa)
 					xco += x;
 
 					/* The weight group paste function */
-
-					ot = ot_weight_paste;
-					WM_operator_properties_create_ptr(&op_ptr, ot);
-					RNA_int_set(&op_ptr, "weight_group", i);
 					icon = (locked) ? ICON_BLANK1 : ICON_PASTEDOWN;
-					uiItemFullO_ptr(row, ot, "", icon, op_ptr.data, WM_OP_INVOKE_DEFAULT, 0);
+					op_ptr = uiItemFullO(row, "OBJECT_OT_vertex_weight_paste", "", icon, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
+					RNA_int_set(&op_ptr, "weight_group", i);
 
 					/* The weight entry delete function */
-
-					ot = ot_weight_delete;
-					WM_operator_properties_create_ptr(&op_ptr, ot);
-					RNA_int_set(&op_ptr, "weight_group", i);
 					icon = (locked) ? ICON_LOCKED : ICON_X;
-					uiItemFullO_ptr(row, ot, "", icon, op_ptr.data, WM_OP_INVOKE_DEFAULT, 0);
+					op_ptr = uiItemFullO(row, "OBJECT_OT_vertex_weight_delete", "", icon, NULL, WM_OP_INVOKE_DEFAULT, UI_ITEM_O_RETURN_PROPS);
+					RNA_int_set(&op_ptr, "weight_group", i);
 
 					yco -= UI_UNIT_Y;
-					
 				}
 			}
 		}
diff --git a/source/blender/editors/space_view3d/view3d_draw.c b/source/blender/editors/space_view3d/view3d_draw.c
index f23e587e55d..3fb2761d40e 100644
--- a/source/blender/editors/space_view3d/view3d_draw.c
+++ b/source/blender/editors/space_view3d/view3d_draw.c
@@ -100,6 +100,7 @@
 #include "GPU_material.h"
 #include "GPU_compositing.h"
 #include "GPU_extensions.h"
+#include "GPU_select.h"
 
 #include "view3d_intern.h"  /* own include */
 
@@ -2023,6 +2024,35 @@ static void view3d_draw_xraytransp(Scene *scene, ARegion *ar, View3D *v3d, const
 	glDepthMask(GL_TRUE);
 }
 
+/* Selection counterpart of the X-ray pass: clears the zbuffer (when '*clear' is set) and draws the objects queued in
+ * 'v3d->afterdraw_xray' over it; note that in the select version we don't care about transparent flag as with regular drawing */
+static void view3d_draw_xray_select(Scene *scene, ARegion *ar, View3D *v3d, bool *clear)
+{
+	/* Not ideal, but we need to read from the previous depths before clearing
+	 * otherwise we could have a function to load the depths after drawing.
+	 *
+	 * Clearing the depth buffer isn't all that common between drawing objects so accept this for now.
+	 */
+	if (U.gpu_select_pick_deph) {
+		GPU_select_load_id(-1);  /* NOTE(review): presumably -1 makes the picker read back the depths drawn so far - confirm */
+	}
+
+	View3DAfter *v3da;
+	if (*clear && v3d->zbuf) {  /* the flag is reset below, so a shared flag clears the depth buffer only once */
+		glClear(GL_DEPTH_BUFFER_BIT);
+		*clear = false;
+	}
+
+	v3d->xray = true;  /* set only while the queued objects are being drawn */
+	while ((v3da = BLI_pophead(&v3d->afterdraw_xray))) {  /* every popped entry is freed, whether or not it gets drawn */
+		if (GPU_select_load_id(v3da->base->selcol)) {
+			draw_object_select(scene, ar, v3d, v3da->base, v3da->dflag);
+		}
+		MEM_freeN(v3da);
+	}
+	v3d->xray = false;
+}
+
 /* *********************** */
 
 /*
@@ -2487,6 +2517,58 @@ void ED_view3d_draw_depth(Scene *scene, ARegion *ar, View3D *v3d, bool alphaover
 	U.obcenter_dia = obcenter_dia;
 }
 
+void ED_view3d_draw_select_loop(
+        ViewContext *vc, Scene *scene, View3D *v3d, ARegion *ar,
+        bool use_obedit_skip, bool use_nearest)
+{
+	short code = 1;  /* next selection id to hand out; 0 is stored in 'selcol' of the bases that are skipped */
+	const short dflag = DRAW_PICKING | DRAW_CONSTCOLOR;  /* the two flags draw_object_select() asserts on */
+	/* Selection-picking draw pass: an edited metaball/armature draws BASACT only, otherwise all bases on a visible layer. */
+	if (vc->obedit && vc->obedit->type == OB_MBALL) {
+		draw_object(scene, ar, v3d, BASACT, dflag);
+	}
+	else if ((vc->obedit && vc->obedit->type == OB_ARMATURE)) {
+		/* if not drawing sketch, draw bones */
+		if (!BDR_drawSketchNames(vc)) {
+			draw_object(scene, ar, v3d, BASACT, dflag);
+		}
+	}
+	else {
+		Base *base;
+
+		for (base = scene->base.first; base; base = base->next) {
+			if (base->lay & v3d->lay) {
+
+				if ((base->object->restrictflag & OB_RESTRICT_SELECT) ||
+				    (use_obedit_skip && (scene->obedit->data == base->object->data)))  /* assumes scene->obedit != NULL whenever use_obedit_skip is set - TODO confirm callers */
+				{
+					base->selcol = 0;
+				}
+				else {
+					base->selcol = code;
+
+					if (use_nearest && (base->object->dtx & OB_DRAWXRAY)) {  /* X-ray objects are queued here and drawn after the loop */
+						ED_view3d_after_add(&v3d->afterdraw_xray, base, dflag);
+					}
+					else {
+						if (GPU_select_load_id(code)) {
+							draw_object_select(scene, ar, v3d, base, dflag);
+						}
+					}
+					code++;  /* ids advance only for bases that were given a non-zero 'selcol' */
+				}
+			}
+		}
+
+		if (use_nearest) {
+			bool xrayclear = true;  /* lets view3d_draw_xray_select() clear the depth buffer before drawing the queue */
+			if (v3d->afterdraw_xray.first) {
+				view3d_draw_xray_select(scene, ar, v3d, &xrayclear);
+			}
+		}
+	}
+}
+
 typedef struct View3DShadow {
 	struct View3DShadow *next, *prev;
 	GPULamp *lamp;
@@ -2955,7 +3037,7 @@ struct RV3DMatrixStore {
 	float pixsize;
 };
 
-void *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d)
+struct RV3DMatrixStore *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d)
 {
 	struct RV3DMatrixStore *rv3dmat = MEM_mallocN(sizeof(*rv3dmat), __func__);
 	copy_m4_m4(rv3dmat->winmat, rv3d->winmat);
@@ -2968,9 +3050,8 @@ void *ED_view3d_mats_rv3d_backup(struct RegionView3D *rv3d)
 	return (void *)rv3dmat;
 }
 
-void ED_view3d_mats_rv3d_restore(struct RegionView3D *rv3d, void *rv3dmat_pt)
+void ED_view3d_mats_rv3d_restore(struct RegionView3D *rv3d, struct RV3DMatrixStore *rv3dmat)
 {
-	struct RV3DMatrixStore *rv3dmat = rv3dmat_pt;
 	copy_m4_m4(rv3d->winmat, rv3dmat->winmat);
 	copy_m4_m4(rv3d->viewmat, rv3dmat->viewmat);
 	copy_m4_m4(rv3d->persmat, rv3dmat->persmat);
diff --git a/source/blender/editors/space_view3d/view3d_edit.c b/source/blender/editors/space_view3d/view3d_edit.c
index 2b53eb71d99..f07727f8118 100644
--- a/source/blender/editors/space_view3d/view3d_edit.c
+++ b/source/blender/editors/space_view3d/view3d_edit.c
@@ -90,19 +90,6 @@ bool ED_view3d_offset_lock_check(const  View3D *v3d, const  RegionView3D *rv3d)
 	return (rv3d->persp != RV3D_CAMOB) && (v3d->ob_centre_cursor || v3d->ob_centre);
 }
 
-static bool view3d_operator_offset_lock_check(bContext *C, wmOperator *op)
-{
-	View3D *v3d = CTX_wm_view3d(C);
-	RegionView3D *rv3d = CTX_wm_region_view3d(C);
-	if (ED_view3d_offset_lock_check(v3d, rv3d)) {
-		BKE_report(op->reports, RPT_WARNING, "View offset is locked");
-		return true;
-	}
-	else {
-		return false;
-	}
-}
-
 /* ********************** view3d_edit: view manipulations ********************* */
 
 /**
@@ -2596,6 +2583,19 @@ void VIEW3D_OT_zoom(wmOperatorType *ot)
 
 
 /* ************************ viewdolly ******************************** */
+static bool viewdolly_offset_lock_check(bContext *C, wmOperator *op)
+{
+	View3D *v3d = CTX_wm_view3d(C);
+	RegionView3D *rv3d = CTX_wm_region_view3d(C);
+	if (ED_view3d_offset_lock_check(v3d, rv3d)) {
+		BKE_report(op->reports, RPT_WARNING, "Cannot dolly when the view offset is locked");
+		return true;
+	}
+	else {
+		return false;
+	}
+}
+
 static void view_dolly_mouseloc(ARegion *ar, float orig_ofs[3], float dvec[3], float dfac)
 {
 	RegionView3D *rv3d = ar->regiondata;
@@ -2746,7 +2746,7 @@ static int viewdolly_invoke(bContext *C, wmOperator *op, const wmEvent *event)
 {
 	ViewOpsData *vod;
 
-	if (view3d_operator_offset_lock_check(C, op))
+	if (viewdolly_offset_lock_check(C, op))
 		return OPERATOR_CANCELLED;
 
 	/* makes op->customdata */
@@ -4364,41 +4364,24 @@ static EnumPropertyItem prop_view_pan_items[] = {
 	{0, NULL, 0, NULL, NULL}
 };
 
-static int viewpan_exec(bContext *C, wmOperator *op)
+static int viewpan_invoke(bContext *C, wmOperator *op, const wmEvent *event)
 {
-	ScrArea *sa = CTX_wm_area(C);
-	ARegion *ar = CTX_wm_region(C);
-	View3D *v3d = CTX_wm_view3d(C);
-	RegionView3D *rv3d = CTX_wm_region_view3d(C);
-	float vec[3];
-	const float co_zero[3] = {0.0f};
-	float mval_f[2] = {0.0f, 0.0f};
-	float zfac;
-	int pandir;
+	int x = 0, y = 0;
+	int pandir = RNA_enum_get(op->ptr, "type");
 
-	if (view3d_operator_offset_lock_check(C, op))
-		return OPERATOR_CANCELLED;
+	if      (pandir == V3D_VIEW_PANRIGHT)  { x = -32; }
+	else if (pandir == V3D_VIEW_PANLEFT)   { x =  32; }
+	else if (pandir == V3D_VIEW_PANUP)     { y = -25; }
+	else if (pandir == V3D_VIEW_PANDOWN)   { y =  25; }
 
-	pandir = RNA_enum_get(op->ptr, "type");
-
-	ED_view3d_camera_lock_init(v3d, rv3d);
-
-	zfac = ED_view3d_calc_zfac(rv3d, co_zero, NULL);
-	if      (pandir == V3D_VIEW_PANRIGHT)  { mval_f[0] = -32.0f; }
-	else if (pandir == V3D_VIEW_PANLEFT)   { mval_f[0] =  32.0f; }
-	else if (pandir == V3D_VIEW_PANUP)     { mval_f[1] = -25.0f; }
-	else if (pandir == V3D_VIEW_PANDOWN)   { mval_f[1] =  25.0f; }
-	ED_view3d_win_to_delta(ar, mval_f, vec, zfac);
-	add_v3_v3(rv3d->ofs, vec);
-
-	if (rv3d->viewlock & RV3D_BOXVIEW)
-		view3d_boxview_sync(sa, ar);
-
-	ED_view3d_depth_tag_update(rv3d);
+	viewops_data_alloc(C, op);
+	viewops_data_create(C, op, event);
+	ViewOpsData *vod = op->customdata;
 
-	ED_view3d_camera_lock_sync(v3d, rv3d);
+	viewmove_apply(vod, vod->oldx + x, vod->oldy + y);
 
-	ED_region_tag_redraw(ar);
+	ED_view3d_depth_tag_update(vod->rv3d);
+	viewops_data_free(C, op);
 
 	return OPERATOR_FINISHED;
 }
@@ -4411,7 +4394,7 @@ void VIEW3D_OT_view_pan(wmOperatorType *ot)
 	ot->idname = "VIEW3D_OT_view_pan";
 
 	/* api callbacks */
-	ot->exec = viewpan_exec;
+	ot->invoke = viewpan_invoke;
 	ot->poll = ED_operator_region_view3d_active;
 
 	/* flags */
@@ -4798,6 +4781,7 @@ static int manipulator_invoke(bContext *C, wmOperator *op, const wmEvent *event)
 
 void VIEW3D_OT_manipulator(wmOperatorType *ot)
 {
+	PropertyRNA *prop;
 
 	/* identifiers */
 	ot->name = "3D Manipulator";
@@ -4812,8 +4796,9 @@ void VIEW3D_OT_manipulator(wmOperatorType *ot)
 	/* properties to pass to transform */
 	Transform_Properties(ot, P_CONSTRAINT);
 
-	RNA_def_boolean(ot->srna, "use_planar_constraint", false, "Planar Constraint", "Limit the transformation to the "
-	                "two axes that have not been clicked (translate/scale only)");
+	prop = RNA_def_boolean(ot->srna, "use_planar_constraint", false, "Planar Constraint", "Limit the transformation to the "
+	                       "two axes that have not been clicked (translate/scale only)");
+	RNA_def_property_flag(prop, PROP_SKIP_SAVE | PROP_HIDDEN);
 }
 
 static int enable_manipulator_invoke(bContext *C, wmOperator *op, const wmEvent *UNUSED(event))
@@ -4902,11 +4887,7 @@ static float view_autodist_depth_margin(ARegion *ar, const int mval[2], int marg
 		rect.ymax = mval[1] + 1;
 	}
 	else {
-		rect.xmax = mval[0] + margin;
-		rect.ymax = mval[1] + margin;
-
-		rect.xmin = mval[0] - margin;
-		rect.ymin = mval[1] - margin;
+		BLI_rcti_init_pt_radius(&rect, mval, margin);
 	}
 
 	view3d_update_depths_rect(ar, &depth_temp, &rect);
diff --git a/source/blender/editors/space_view3d/view3d_intern.h b/source/blender/editors/space_view3d/view3d_intern.h
index b11f42bcfef..c2b8d1f8bda 100644
--- a/source/blender/editors/space_view3d/view3d_intern.h
+++ b/source/blender/editors/space_view3d/view3d_intern.h
@@ -143,6 +143,8 @@ void draw_motion_paths_cleanup(View3D *v3d);
 
 /* drawobject.c */
 void draw_object(Scene *scene, struct ARegion *ar, View3D *v3d, Base *base, const short dflag);
+void draw_object_select(Scene *scene, ARegion *ar, View3D *v3d, Base *base, const short dflag);
+
 bool draw_glsl_material(Scene *scene, struct Object *ob, View3D *v3d, const char dt);
 void draw_object_instance(Scene *scene, View3D *v3d, RegionView3D *rv3d, struct Object *ob, const char dt, int outline);
 void draw_object_backbufsel(Scene *scene, View3D *v3d, RegionView3D *rv3d, struct Object *ob);
@@ -195,7 +197,11 @@ void draw_sim_debug_data(Scene *scene, View3D *v3d, ARegion *ar);
 void view3d_main_region_draw(const struct bContext *C, struct ARegion *ar);
 void ED_view3d_draw_depth(Scene *scene, struct ARegion *ar, View3D *v3d, bool alphaoverride);
 void ED_view3d_draw_depth_gpencil(Scene *scene, ARegion *ar, View3D *v3d);
-void ED_view3d_after_add(ListBase *lb, Base *base, const short dflag);
+void ED_view3d_draw_select_loop(
+        ViewContext *vc, Scene *scene, View3D *v3d, ARegion *ar,
+        bool use_obedit_skip, bool use_nearest);
+
+void ED_view3d_after_add(ListBase *lb, Base *base, const short dflag);
 
 void circf(float x, float y, float rad);
 void circ(float x, float y, float rad);
@@ -241,7 +247,7 @@ void ED_view3d_smooth_view_force_finish(
         struct bContext *C,
         struct View3D *v3d, struct ARegion *ar);
 
-void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rctf *rect);
+void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rcti *rect);
 void view3d_viewmatrix_set(Scene *scene, const View3D *v3d, RegionView3D *rv3d);
 
 void fly_modal_keymap(struct wmKeyConfig *keyconf);
diff --git a/source/blender/editors/space_view3d/view3d_ops.c b/source/blender/editors/space_view3d/view3d_ops.c
index 0fa6841fe27..d71639c35d2 100644
--- a/source/blender/editors/space_view3d/view3d_ops.c
+++ b/source/blender/editors/space_view3d/view3d_ops.c
@@ -241,13 +241,21 @@ void view3d_keymap(wmKeyConfig *keyconf)
 	keymap = WM_keymap_find(keyconf, "3D View", SPACE_VIEW3D, 0);
 
 	/* Shift+LMB behavior first, so it has priority over KM_ANY item below. */
-	kmi = WM_keymap_verify_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0);
+	kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0);
 	RNA_boolean_set(kmi->ptr, "release_confirm", true);
 	RNA_boolean_set(kmi->ptr, "use_planar_constraint", true);
+	RNA_boolean_set(kmi->ptr, "use_accurate", false);
+
+	kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_SHIFT, 0);
+	RNA_boolean_set(kmi->ptr, "release_confirm", true);
+	RNA_boolean_set(kmi->ptr, "use_planar_constraint", false);
+	RNA_boolean_set(kmi->ptr, "use_accurate", true);
+
 	/* Using KM_ANY here to allow holding modifiers before starting to transform. */
 	kmi = WM_keymap_add_item(keymap, "VIEW3D_OT_manipulator", LEFTMOUSE, KM_PRESS, KM_ANY, 0);
 	RNA_boolean_set(kmi->ptr, "release_confirm", true);
 	RNA_boolean_set(kmi->ptr, "use_planar_constraint", false);
+	RNA_boolean_set(kmi->ptr, "use_accurate", false);
 
 	WM_keymap_verify_item(keymap, "VIEW3D_OT_cursor3d", ACTIONMOUSE, KM_PRESS, 0, 0);
 	
diff --git a/source/blender/editors/space_view3d/view3d_select.c b/source/blender/editors/space_view3d/view3d_select.c
index 3239d07553f..7d927766bbd 100644
--- a/source/blender/editors/space_view3d/view3d_select.c
+++ b/source/blender/editors/space_view3d/view3d_select.c
@@ -98,6 +98,8 @@
 
 #include "view3d_intern.h"  /* own include */
 
+// #include "PIL_time_utildefines.h"
+
 float ED_view3d_select_dist_px(void)
 {
 	return 75.0f * U.pixelsize;
@@ -1087,7 +1089,9 @@ static void deselectall_except(Scene *scene, Base *b)   /* deselect all except b
 	}
 }
 
-static Base *object_mouse_select_menu(bContext *C, ViewContext *vc, unsigned int *buffer, int hits, const int mval[2], short toggle)
+static Base *object_mouse_select_menu(
+        bContext *C, ViewContext *vc, unsigned int *buffer, int hits,
+        const int mval[2], bool toggle)
 {
 	short baseCount = 0;
 	bool ok;
@@ -1178,19 +1182,19 @@ static bool selectbuffer_has_bones(const unsigned int *buffer, const unsigned in
 }
 
 /* utility function for mixed_bones_object_selectbuffer */
-static short selectbuffer_ret_hits_15(unsigned int *UNUSED(buffer), const short hits15)
+static int selectbuffer_ret_hits_15(unsigned int *UNUSED(buffer), const int hits15)
 {
 	return hits15;
 }
 
-static short selectbuffer_ret_hits_9(unsigned int *buffer, const short hits15, const short hits9)
+static int selectbuffer_ret_hits_9(unsigned int *buffer, const int hits15, const int hits9)
 {
 	const int offs = 4 * hits15;
 	memcpy(buffer, buffer + offs, 4 * hits9 * sizeof(unsigned int));
 	return hits9;
 }
 
-static short selectbuffer_ret_hits_5(unsigned int *buffer, const short hits15, const short hits9, const short hits5)
+static int selectbuffer_ret_hits_5(unsigned int *buffer, const int hits15, const int hits9, const int hits5)
 {
 	const int offs = 4 * hits15 + 4 * hits9;
 	memcpy(buffer, buffer + offs, 4 * hits5  * sizeof(unsigned int));
@@ -1199,14 +1203,13 @@ static short selectbuffer_ret_hits_5(unsigned int *buffer, const short hits15, c
 
 /* we want a select buffer with bones, if there are... */
 /* so check three selection levels and compare */
-static short mixed_bones_object_selectbuffer(
+static int mixed_bones_object_selectbuffer(
         ViewContext *vc, unsigned int *buffer, const int mval[2],
         bool use_cycle, bool enumerate,
         bool *r_do_nearest)
 {
 	rcti rect;
-	int offs;
-	short hits15, hits9 = 0, hits5 = 0;
+	int hits15, hits9 = 0, hits5 = 0;
 	bool has_bones15 = false, has_bones9 = false, has_bones5 = false;
 	static int last_mval[2] = {-100, -100};
 	bool do_nearest = false;
@@ -1234,44 +1237,57 @@ static short mixed_bones_object_selectbuffer(
 
 	do_nearest = do_nearest && !enumerate;
 
-	BLI_rcti_init(&rect, mval[0] - 14, mval[0] + 14, mval[1] - 14, mval[1] + 14);
-	hits15 = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, do_nearest);
+	const int select_mode = (do_nearest ? VIEW3D_SELECT_PICK_NEAREST : VIEW3D_SELECT_PICK_ALL);
+	int hits = 0;
+
+	/* we _must_ end cache before return, use 'goto finally' */
+	view3d_opengl_select_cache_begin();
+
+	BLI_rcti_init_pt_radius(&rect, mval, 14);
+	hits15 = view3d_opengl_select(vc, buffer, MAXPICKBUF, &rect, select_mode);
 	if (hits15 == 1) {
-		return selectbuffer_ret_hits_15(buffer, hits15);
+		hits = selectbuffer_ret_hits_15(buffer, hits15);
+		goto finally;
 	}
 	else if (hits15 > 0) {
+		int offs;
 		has_bones15 = selectbuffer_has_bones(buffer, hits15);
 
 		offs = 4 * hits15;
-		BLI_rcti_init(&rect, mval[0] - 9, mval[0] + 9, mval[1] - 9, mval[1] + 9);
-		hits9 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, do_nearest);
+		BLI_rcti_init_pt_radius(&rect, mval, 9);
+		hits9 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, select_mode);
 		if (hits9 == 1) {
-			return selectbuffer_ret_hits_9(buffer, hits15, hits9);
+			hits = selectbuffer_ret_hits_9(buffer, hits15, hits9);
+			goto finally;
 		}
 		else if (hits9 > 0) {
 			has_bones9 = selectbuffer_has_bones(buffer + offs, hits9);
 
 			offs += 4 * hits9;
-			BLI_rcti_init(&rect, mval[0] - 5, mval[0] + 5, mval[1] - 5, mval[1] + 5);
-			hits5 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, do_nearest);
+			BLI_rcti_init_pt_radius(&rect, mval, 5);
+			hits5 = view3d_opengl_select(vc, buffer + offs, MAXPICKBUF - offs, &rect, select_mode);
 			if (hits5 == 1) {
-				return selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5);
+				hits = selectbuffer_ret_hits_5(buffer, hits15, hits9, hits5);
+				goto finally;
 			}
 			else if (hits5 > 0) {
 				has_bones5 = selectbuffer_has_bones(buffer + offs, hits5);
 			}
 		}
 
-		if      (has_bones5)  return selectbuffer_ret_hits_5(buffer,  hits15, hits9, hits5);
-		else if (has_bones9)  return selectbuffer_ret_hits_9(buffer,  hits15, hits9);
-		else if (has_bones15) return selectbuffer_ret_hits_15(buffer, hits15);
-		
-		if      (hits5 > 0) return selectbuffer_ret_hits_5(buffer,  hits15, hits9, hits5);
-		else if (hits9 > 0) return selectbuffer_ret_hits_9(buffer,  hits15, hits9);
-		else                return selectbuffer_ret_hits_15(buffer, hits15);
+		if      (has_bones5)  { hits = selectbuffer_ret_hits_5(buffer,  hits15, hits9, hits5); goto finally; }
+		else if (has_bones9)  { hits = selectbuffer_ret_hits_9(buffer,  hits15, hits9); goto finally; }
+		else if (has_bones15) { hits = selectbuffer_ret_hits_15(buffer, hits15); goto finally; }
+
+		if      (hits5 > 0) { hits = selectbuffer_ret_hits_5(buffer,  hits15, hits9, hits5); goto finally; }
+		else if (hits9 > 0) { hits = selectbuffer_ret_hits_9(buffer,  hits15, hits9); goto finally; }
+		else                { hits = selectbuffer_ret_hits_15(buffer, hits15); goto finally; }
 	}
-	
-	return 0;
+
+finally:
+	view3d_opengl_select_cache_end();
+
+	return hits;
 }
 
 /* returns basact */
@@ -1412,7 +1428,7 @@ static bool ed_object_select_pick(
 	bool is_obedit;
 	float dist = ED_view3d_select_dist_px() * 1.3333f;
 	bool retval = false;
-	short hits;
+	int hits;
 	const float mval_fl[2] = {(float)mval[0], (float)mval[1]};
 
 	
@@ -1464,10 +1480,13 @@ static bool ed_object_select_pick(
 		unsigned int buffer[MAXPICKBUF];
 		bool do_nearest;
 
+		// TIMEIT_START(select_time);
+
 		/* if objects have posemode set, the bones are in the same selection buffer */
-		
 		hits = mixed_bones_object_selectbuffer(&vc, buffer, mval, true, enumerate, &do_nearest);
-		
+
+		// TIMEIT_END(select_time);
+
 		if (hits > 0) {
 			/* note: bundles are handling in the same way as bones */
 			const bool has_bones = selectbuffer_has_bones(buffer, hits);
@@ -1904,9 +1923,9 @@ static int do_meta_box_select(ViewContext *vc, rcti *rect, bool select, bool ext
 	int a;
 
 	unsigned int buffer[MAXPICKBUF];
-	short hits;
+	int hits;
 
-	hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, false);
+	hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, VIEW3D_SELECT_ALL);
 
 	if (extend == false && select)
 		BKE_mball_deselect_all(mb);
@@ -1938,9 +1957,9 @@ static int do_armature_box_select(ViewContext *vc, rcti *rect, bool select, bool
 	int a;
 
 	unsigned int buffer[MAXPICKBUF];
-	short hits;
+	int hits;
 
-	hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, false);
+	hits = view3d_opengl_select(vc, buffer, MAXPICKBUF, rect, VIEW3D_SELECT_ALL);
 	
 	/* clear flag we use to detect point was affected */
 	for (ebone = arm->edbo->first; ebone; ebone = ebone->next)
@@ -2013,7 +2032,7 @@ static int do_object_pose_box_select(bContext *C, ViewContext *vc, rcti *rect, b
 	int bone_only;
 	int bone_selected = 0;
 	int totobj = MAXPICKBUF; /* XXX solve later */
-	short hits;
+	int hits;
 	
 	if ((ob) && (ob->mode & OB_MODE_POSE))
 		bone_only = 1;
@@ -2037,7 +2056,7 @@ static int do_object_pose_box_select(bContext *C, ViewContext *vc, rcti *rect, b
 
 	/* selection buffer now has bones potentially too, so we add MAXPICKBUF */
 	vbuffer = MEM_mallocN(4 * (totobj + MAXPICKELEMS) * sizeof(unsigned int), "selection buffer");
-	hits = view3d_opengl_select(vc, vbuffer, 4 * (totobj + MAXPICKELEMS), rect, false);
+	hits = view3d_opengl_select(vc, vbuffer, 4 * (totobj + MAXPICKELEMS), rect, VIEW3D_SELECT_ALL);
 	/*
 	 * LOGIC NOTES (theeth):
 	 * The buffer and ListBase have the same relative order, which makes the selection
@@ -2577,7 +2596,7 @@ static void lattice_circle_select(ViewContext *vc, const bool select, const int
 
 
 /* NOTE: pose-bone case is copied from editbone case... */
-static short pchan_circle_doSelectJoint(void *userData, bPoseChannel *pchan, const float screen_co[2])
+static bool pchan_circle_doSelectJoint(void *userData, bPoseChannel *pchan, const float screen_co[2])
 {
 	CircleSelectUserData *data = userData;
 
@@ -2655,7 +2674,7 @@ static void pose_circle_select(ViewContext *vc, const bool select, const int mva
 	}
 }
 
-static short armature_circle_doSelectJoint(void *userData, EditBone *ebone, const float screen_co[2], short head)
+static bool armature_circle_doSelectJoint(void *userData, EditBone *ebone, const float screen_co[2], bool head)
 {
 	CircleSelectUserData *data = userData;
 
diff --git a/source/blender/editors/space_view3d/view3d_view.c b/source/blender/editors/space_view3d/view3d_view.c
index 8582952d1a0..fe8274064e5 100644
--- a/source/blender/editors/space_view3d/view3d_view.c
+++ b/source/blender/editors/space_view3d/view3d_view.c
@@ -908,7 +908,7 @@ void ED_view3d_polygon_offset(const RegionView3D *rv3d, const float dist)
 /**
  * \param rect optional for picking (can be NULL).
  */
-void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rctf *rect)
+void view3d_winmatrix_set(ARegion *ar, const View3D *v3d, const rcti *rect)
 {
 	RegionView3D *rv3d = ar->regiondata;
 	rctf viewplane;
@@ -1091,76 +1091,19 @@ void view3d_viewmatrix_set(Scene *scene, const View3D *v3d, RegionView3D *rv3d)
 	}
 }
 
-static void view3d_select_loop(ViewContext *vc, Scene *scene, View3D *v3d, ARegion *ar, bool use_obedit_skip)
+/**
+ * Optionally cache data for multiple calls to #view3d_opengl_select
+ *
+ * just avoid GPU_select headers outside this file
+ */
+void view3d_opengl_select_cache_begin(void)
 {
-	short code = 1;
-	char dt;
-	short dtx;
-
-	if (vc->obedit && vc->obedit->type == OB_MBALL) {
-		draw_object(scene, ar, v3d, BASACT, DRAW_PICKING | DRAW_CONSTCOLOR);
-	}
-	else if ((vc->obedit && vc->obedit->type == OB_ARMATURE)) {
-		/* if not drawing sketch, draw bones */
-		if (!BDR_drawSketchNames(vc)) {
-			draw_object(scene, ar, v3d, BASACT, DRAW_PICKING | DRAW_CONSTCOLOR);
-		}
-	}
-	else {
-		Base *base;
-
-		v3d->xray = true;  /* otherwise it postpones drawing */
-		for (base = scene->base.first; base; base = base->next) {
-			if (base->lay & v3d->lay) {
-
-				if ((base->object->restrictflag & OB_RESTRICT_SELECT) ||
-				    (use_obedit_skip && (scene->obedit->data == base->object->data)))
-				{
-					base->selcol = 0;
-				}
-				else {
-					base->selcol = code;
-
-					if (GPU_select_load_id(code)) {
-						draw_object(scene, ar, v3d, base, DRAW_PICKING | DRAW_CONSTCOLOR);
-
-						/* we draw duplicators for selection too */
-						if ((base->object->transflag & OB_DUPLI)) {
-							ListBase *lb;
-							DupliObject *dob;
-							Base tbase;
-
-							tbase.flag = OB_FROMDUPLI;
-							lb = object_duplilist(G.main->eval_ctx, scene, base->object);
-
-							for (dob = lb->first; dob; dob = dob->next) {
-								float omat[4][4];
-
-								tbase.object = dob->ob;
-								copy_m4_m4(omat, dob->ob->obmat);
-								copy_m4_m4(dob->ob->obmat, dob->mat);
-
-								/* extra service: draw the duplicator in drawtype of parent */
-								/* MIN2 for the drawtype to allow bounding box objects in groups for lods */
-								dt = tbase.object->dt;   tbase.object->dt = MIN2(tbase.object->dt, base->object->dt);
-								dtx = tbase.object->dtx; tbase.object->dtx = base->object->dtx;
-
-								draw_object(scene, ar, v3d, &tbase, DRAW_PICKING | DRAW_CONSTCOLOR);
-
-								tbase.object->dt = dt;
-								tbase.object->dtx = dtx;
+	GPU_select_cache_begin();
+}
 
-								copy_m4_m4(dob->ob->obmat, omat);
-							}
-							free_object_duplilist(lb);
-						}
-					}
-					code++;
-				}
-			}
-		}
-		v3d->xray = false;  /* restore */
-	}
+void view3d_opengl_select_cache_end(void)
+{
+	GPU_select_cache_end();
 }
 
 /**
@@ -1170,29 +1113,65 @@ static void view3d_select_loop(ViewContext *vc, Scene *scene, View3D *v3d, ARegi
  *
  * \note (vc->obedit == NULL) can be set to explicitly skip edit-object selection.
  */
-short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int bufsize, const rcti *input, bool do_nearest)
+int view3d_opengl_select(
+        ViewContext *vc, unsigned int *buffer, unsigned int bufsize, const rcti *input,
+        eV3DSelectMode select_mode)
 {
 	Scene *scene = vc->scene;
 	View3D *v3d = vc->v3d;
 	ARegion *ar = vc->ar;
-	rctf rect;
-	short hits;
+	rcti rect;
+	int hits;
 	const bool use_obedit_skip = (scene->obedit != NULL) && (vc->obedit == NULL);
-	const bool do_passes = do_nearest && GPU_select_query_check_active();
+	const bool is_pick_select = (U.gpu_select_pick_deph != 0);
+	const bool do_passes = (
+	        (is_pick_select == false) &&
+	        (select_mode == VIEW3D_SELECT_PICK_NEAREST) &&
+	        GPU_select_query_check_active());
+	const bool use_nearest = (is_pick_select && select_mode == VIEW3D_SELECT_PICK_NEAREST);
+
+	char gpu_select_mode;
 
-	G.f |= G_PICKSEL;
-	
 	/* case not a border select */
 	if (input->xmin == input->xmax) {
-		rect.xmin = input->xmin - 12;  /* seems to be default value for bones only now */
-		rect.xmax = input->xmin + 12;
-		rect.ymin = input->ymin - 12;
-		rect.ymax = input->ymin + 12;
+		/* seems to be default value for bones only now */
+		BLI_rcti_init_pt_radius(&rect, (const int[2]){input->xmin, input->ymin}, 12);
 	}
 	else {
-		BLI_rctf_rcti_copy(&rect, input);
+		rect = *input;
 	}
-	
+
+	if (is_pick_select) {
+		if (is_pick_select && select_mode == VIEW3D_SELECT_PICK_NEAREST) {
+			gpu_select_mode = GPU_SELECT_PICK_NEAREST;
+		}
+		else if (is_pick_select && select_mode == VIEW3D_SELECT_PICK_ALL) {
+			gpu_select_mode = GPU_SELECT_PICK_ALL;
+		}
+		else {
+			gpu_select_mode = GPU_SELECT_ALL;
+		}
+	}
+	else {
+		if (do_passes) {
+			gpu_select_mode = GPU_SELECT_NEAREST_FIRST_PASS;
+		}
+		else {
+			gpu_select_mode = GPU_SELECT_ALL;
+		}
+	}
+
+	/* Re-use cache (rect must be smaller than the cached one),
+	 * other context is assumed to be unchanged */
+	if (GPU_select_is_cached()) {
+		GPU_select_begin(buffer, bufsize, &rect, gpu_select_mode, 0);
+		GPU_select_cache_load_id();
+		hits = GPU_select_end();
+		goto finally;
+	}
+
+	G.f |= G_PICKSEL;
+
 	view3d_winmatrix_set(ar, v3d, &rect);
 	mul_m4_m4m4(vc->rv3d->persmat, vc->rv3d->winmat, vc->rv3d->viewmat);
 	
@@ -1204,12 +1183,9 @@ short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int b
 	if (vc->rv3d->rflag & RV3D_CLIPPING)
 		ED_view3d_clipping_set(vc->rv3d);
 	
-	if (do_passes)
-		GPU_select_begin(buffer, bufsize, &rect, GPU_SELECT_NEAREST_FIRST_PASS, 0);
-	else
-		GPU_select_begin(buffer, bufsize, &rect, GPU_SELECT_ALL, 0);
+	GPU_select_begin(buffer, bufsize, &rect, gpu_select_mode, 0);
 
-	view3d_select_loop(vc, scene, v3d, ar, use_obedit_skip);
+	ED_view3d_draw_select_loop(vc, scene, v3d, ar, use_obedit_skip, use_nearest);
 
 	hits = GPU_select_end();
 	
@@ -1217,7 +1193,7 @@ short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int b
 	if (do_passes) {
 		GPU_select_begin(buffer, bufsize, &rect, GPU_SELECT_NEAREST_SECOND_PASS, hits);
 
-		view3d_select_loop(vc, scene, v3d, ar, use_obedit_skip);
+		ED_view3d_draw_select_loop(vc, scene, v3d, ar, use_obedit_skip, use_nearest);
 
 		GPU_select_end();
 	}
@@ -1233,7 +1209,8 @@ short view3d_opengl_select(ViewContext *vc, unsigned int *buffer, unsigned int b
 	
 	if (vc->rv3d->rflag & RV3D_CLIPPING)
 		ED_view3d_clipping_disable();
-	
+
+finally:
 	if (hits < 0) printf("Too many objects in select buffer\n");  /* XXX make error message */
 
 	return hits;
@@ -1428,6 +1405,8 @@ static bool view3d_localview_init(
 		}
 	}
 
+	DAG_on_visible_update(bmain, false);
+
 	return ok;
 }
 
diff --git a/source/blender/editors/transform/transform.c b/source/blender/editors/transform/transform.c
index 1916f9b4dab..7d9063c3285 100644
--- a/source/blender/editors/transform/transform.c
+++ b/source/blender/editors/transform/transform.c
@@ -2176,7 +2176,14 @@ bool initTransform(bContext *C, TransInfo *t, wmOperator *op, const wmEvent *eve
 	calculateCenter(t);
 
 	if (event) {
-		initMouseInput(t, &t->mouse, t->center2d, event->mval, event->shift);
+		/* Initialize accurate transform to settings requested by keymap. */
+		bool use_accurate = false;
+		if ((prop = RNA_struct_find_property(op->ptr, "use_accurate")) && RNA_property_is_set(op->ptr, prop)) {
+			if (RNA_property_boolean_get(op->ptr, prop)) {
+				use_accurate = true;
+			}
+		}
+		initMouseInput(t, &t->mouse, t->center2d, event->mval, use_accurate);
 	}
 
 	switch (mode) {
diff --git a/source/blender/editors/transform/transform_conversions.c b/source/blender/editors/transform/transform_conversions.c
index 091a5773cf0..5e67f304755 100644
--- a/source/blender/editors/transform/transform_conversions.c
+++ b/source/blender/editors/transform/transform_conversions.c
@@ -1973,9 +1973,12 @@ void flushTransParticles(TransInfo *t)
 
 /* ********************* mesh ****************** */
 
-static bool bmesh_test_dist_add(BMVert *v, BMVert *v_other,
-                                float *dists, const float *dists_prev,
-                                float mtx[3][3])
+static bool bmesh_test_dist_add(
+        BMVert *v, BMVert *v_other,
+        float *dists, const float *dists_prev,
+        /* optionally track original index */
+        int *index, const int *index_prev,
+        float mtx[3][3])
 {
 	if ((BM_elem_flag_test(v_other, BM_ELEM_SELECT) == 0) &&
 	    (BM_elem_flag_test(v_other, BM_ELEM_HIDDEN) == 0))
@@ -1990,6 +1993,9 @@ static bool bmesh_test_dist_add(BMVert *v, BMVert *v_other,
 		dist_other = dists_prev[i] + len_v3(vec);
 		if (dist_other < dists[i_other]) {
 			dists[i_other] = dist_other;
+			if (index != NULL) {
+				index[i_other] = index_prev[i];
+			}
 			return true;
 		}
 	}
@@ -1997,11 +2003,13 @@ static bool bmesh_test_dist_add(BMVert *v, BMVert *v_other,
 	return false;
 }
 
-static void editmesh_set_connectivity_distance(BMesh *bm, float mtx[3][3], float *dists)
+/**
+ * \param mtx: Measure distance in this space.
+ * \param dists: Store the closest connected distance to selected vertices.
+ * \param index: Optionally store the original index we're measuring the distance to (can be NULL).
+ */
+static void editmesh_set_connectivity_distance(BMesh *bm, float mtx[3][3], float *dists, int *index)
 {
-	/* need to be very careful of feedback loops here, store previous dist's to avoid feedback */
-	float *dists_prev = MEM_mallocN(bm->totvert * sizeof(float), __func__);
-
 	BLI_LINKSTACK_DECLARE(queue, BMVert *);
 
 	/* any BM_ELEM_TAG'd vertex is in 'queue_next', so we don't add in twice */
@@ -2022,17 +2030,27 @@ static void editmesh_set_connectivity_distance(BMesh *bm, float mtx[3][3], float
 
 			if (BM_elem_flag_test(v, BM_ELEM_SELECT) == 0 || BM_elem_flag_test(v, BM_ELEM_HIDDEN)) {
 				dist = FLT_MAX;
+				if (index != NULL) {
+					index[i] = i;
+				}
 			}
 			else {
 				BLI_LINKSTACK_PUSH(queue, v);
 				dist = 0.0f;
+				if (index != NULL) {
+					index[i] = i;
+				}
 			}
 
-			dists[i] = dists_prev[i] = dist;
+			dists[i] = dist;
 		}
 		bm->elem_index_dirty &= ~BM_VERT;
 	}
 
+	/* need to be very careful of feedback loops here, store previous dist's to avoid feedback */
+	float *dists_prev = MEM_dupallocN(dists);
+	int *index_prev = MEM_dupallocN(index);  /* may be NULL */
+
 	do {
 		BMVert *v;
 		LinkNode *lnk;
@@ -2061,7 +2079,7 @@ static void editmesh_set_connectivity_distance(BMesh *bm, float mtx[3][3], float
 						/* edge distance */
 						{
 							BMVert *v_other = BM_edge_other_vert(e_iter, v);
-							if (bmesh_test_dist_add(v, v_other, dists, dists_prev, mtx)) {
+							if (bmesh_test_dist_add(v, v_other, dists, dists_prev, index, index_prev, mtx)) {
 								if (BM_elem_flag_test(v_other, BM_ELEM_TAG) == 0) {
 									BM_elem_flag_enable(v_other, BM_ELEM_TAG);
 									BLI_LINKSTACK_PUSH(queue_next, v_other);
@@ -2086,7 +2104,7 @@ static void editmesh_set_connectivity_distance(BMesh *bm, float mtx[3][3], float
 								    (BM_elem_flag_test(l_iter_radial->f, BM_ELEM_HIDDEN) == 0))
 								{
 									BMVert *v_other = l_iter_radial->next->next->v;
-									if (bmesh_test_dist_add(v, v_other, dists, dists_prev, mtx)) {
+									if (bmesh_test_dist_add(v, v_other, dists, dists_prev, index, index_prev, mtx)) {
 										if (BM_elem_flag_test(v_other, BM_ELEM_TAG) == 0) {
 											BM_elem_flag_enable(v_other, BM_ELEM_TAG);
 											BLI_LINKSTACK_PUSH(queue_next, v_other);
@@ -2110,6 +2128,9 @@ static void editmesh_set_connectivity_distance(BMesh *bm, float mtx[3][3], float
 
 			/* keep in sync, avoid having to do full memcpy each iteration */
 			dists_prev[i] = dists[i];
+			if (index != NULL) {
+				index_prev[i] = index[i];
+			}
 		}
 
 		BLI_LINKSTACK_SWAP(queue, queue_next);
@@ -2123,9 +2144,14 @@ static void editmesh_set_connectivity_distance(BMesh *bm, float mtx[3][3], float
 	BLI_LINKSTACK_FREE(queue_next);
 
 	MEM_freeN(dists_prev);
+	if (index_prev != NULL) {
+		MEM_freeN(index_prev);
+	}
 }
 
-static struct TransIslandData *editmesh_islands_info_calc(BMEditMesh *em, int *r_island_tot, int **r_island_vert_map)
+static struct TransIslandData *editmesh_islands_info_calc(
+        BMEditMesh *em, int *r_island_tot, int **r_island_vert_map,
+        bool calc_single_islands)
 {
 	BMesh *bm = em->bm;
 	struct TransIslandData *trans_islands;
@@ -2237,6 +2263,42 @@ static struct TransIslandData *editmesh_islands_info_calc(BMEditMesh *em, int *r
 	MEM_freeN(groups_array);
 	MEM_freeN(group_index);
 
+	/* For proportional editing (PET): give each selected vertex without an island an island of 1, so connected vertices can use it with V3D_AROUND_LOCAL_ORIGINS. */
+	if (calc_single_islands) {
+		BMIter viter;
+		BMVert *v;
+		int group_tot_single = 0;
+		/* first pass: count selected vertices that have no island yet (vert_map[i] == -1) */
+		BM_ITER_MESH_INDEX (v, &viter, bm, BM_VERTS_OF_MESH, i) {
+			if (BM_elem_flag_test(v, BM_ELEM_SELECT) && (vert_map[i] == -1)) {
+				group_tot_single += 1;
+			}
+		}
+
+		if (group_tot_single != 0) {
+			/* the new length is given in bytes, so the island count must be scaled by the element size */
+			trans_islands = MEM_reallocN(trans_islands, sizeof(*trans_islands) * (group_tot + group_tot_single));
+			BM_ITER_MESH_INDEX (v, &viter, bm, BM_VERTS_OF_MESH, i) {
+				if (BM_elem_flag_test(v, BM_ELEM_SELECT) && (vert_map[i] == -1)) {
+					struct TransIslandData *v_island = &trans_islands[group_tot];
+					vert_map[i] = group_tot;
+
+					copy_v3_v3(v_island->co, v->co);
+					/* orientation comes from the vertex normal, identity when the normal is zero */
+					if (!is_zero_v3(v->no)) {
+						axis_dominant_v3_to_m3(v_island->axismtx, v->no);
+						invert_m3(v_island->axismtx);
+					}
+					else {
+						unit_m3(v_island->axismtx);
+					}
+
+					group_tot += 1;
+				}
+			}
+		}
+	}
+
 	*r_island_tot = group_tot;
 	*r_island_vert_map = vert_map;
 
@@ -2336,6 +2398,11 @@ static void createTransEditVerts(TransInfo *t)
 	int island_info_tot;
 	int *island_vert_map = NULL;
 
+	const bool is_island_center = (t->around == V3D_AROUND_LOCAL_ORIGINS) && (t->mode != TFM_TRANSLATION);
+	/* Original index of our connected vertex when connected distances are calculated.
+	 * Optional, allocate if needed. */
+	int *dists_index = NULL;
+
 	if (t->flag & T_MIRROR) {
 		EDBM_verts_mirror_cache_begin(em, 0, false, (t->flag & T_PROP_EDIT) == 0, use_topology);
 		mirror = 1;
@@ -2367,8 +2434,12 @@ static void createTransEditVerts(TransInfo *t)
 		t->total = count;
 
 		/* allocating scratch arrays */
-		if (prop_mode & T_PROP_CONNECTED)
-			dists = MEM_mallocN(em->bm->totvert * sizeof(float), "scratch nears");
+		if (prop_mode & T_PROP_CONNECTED) {
+			dists = MEM_mallocN(em->bm->totvert * sizeof(float), __func__);
+			if (is_island_center) {
+				dists_index =  MEM_mallocN(em->bm->totvert * sizeof(int), __func__);
+			}
+		}
 	}
 	else {
 		t->total = bm->totvertsel;
@@ -2390,7 +2461,7 @@ static void createTransEditVerts(TransInfo *t)
 	pseudoinverse_m3_m3(smtx, mtx, PSEUDOINVERSE_EPSILON);
 
 	if (prop_mode & T_PROP_CONNECTED) {
-		editmesh_set_connectivity_distance(em->bm, mtx, dists);
+		editmesh_set_connectivity_distance(em->bm, mtx, dists, dists_index);
 	}
 
 	/* Only in case of rotation and resize, we want the elements of the edited
@@ -2398,8 +2469,14 @@ static void createTransEditVerts(TransInfo *t)
 	 *
 	 * TODO: use island_info to detect the closest point when the "Snap Target"
 	 * in Blender UI is "Closest" */
-	if ((t->around == V3D_AROUND_LOCAL_ORIGINS) && (t->mode != TFM_TRANSLATION)) {
-		island_info = editmesh_islands_info_calc(em, &island_info_tot, &island_vert_map);
+	if (is_island_center) {
+		/* In this specific case, near-by vertices will need to know the island of the nearest connected vertex. */
+		const bool calc_single_islands = (
+		        (prop_mode & T_PROP_CONNECTED) &&
+		        (t->around == V3D_AROUND_LOCAL_ORIGINS) &&
+		        (em->selectmode & SCE_SELECT_VERTEX));
+
+		island_info = editmesh_islands_info_calc(em, &island_info_tot, &island_vert_map, calc_single_islands);
 	}
 
 	/* detect CrazySpace [tm] */
@@ -2449,10 +2526,16 @@ static void createTransEditVerts(TransInfo *t)
 	BM_ITER_MESH_INDEX (eve, &iter, bm, BM_VERTS_OF_MESH, a) {
 		if (!BM_elem_flag_test(eve, BM_ELEM_HIDDEN)) {
 			if (prop_mode || BM_elem_flag_test(eve, BM_ELEM_SELECT)) {
-				struct TransIslandData *v_island = (island_info && island_vert_map[a] != -1) ?
-				                                   &island_info[island_vert_map[a]] : NULL;
+				struct TransIslandData *v_island = NULL;
 				float *bweight = (cd_vert_bweight_offset != -1) ? BM_ELEM_CD_GET_VOID_P(eve, cd_vert_bweight_offset) : NULL;
 
+				if (island_info) {
+					const int connected_index = (dists_index && dists_index[a] != -1) ? dists_index[a] : a;
+					v_island = (island_vert_map[connected_index] != -1) ?
+					           &island_info[island_vert_map[connected_index]] : NULL;
+				}
+
+
 				VertsToTransData(t, tob, tx, em, eve, bweight, v_island);
 				if (tx)
 					tx++;
@@ -2531,6 +2614,8 @@ cleanup:
 		MEM_freeN(defmats);
 	if (dists)
 		MEM_freeN(dists);
+	if (dists_index)
+		MEM_freeN(dists_index);
 
 	if (t->flag & T_MIRROR) {
 		EDBM_verts_mirror_cache_end(em);
@@ -8038,7 +8123,12 @@ void createTransData(bContext *C, TransInfo *t)
 		if (t->data && t->flag & T_PROP_EDIT) {
 			if (ELEM(t->obedit->type, OB_CURVE, OB_MESH)) {
 				sort_trans_data(t); // makes selected become first in array
-				set_prop_dist(t, 0);
+				if ((t->obedit->type == OB_MESH) && (t->flag & T_PROP_CONNECTED)) {
+					/* already calculated by editmesh_set_connectivity_distance */
+				}
+				else {
+					set_prop_dist(t, 0);
+				}
 				sort_trans_data_dist(t);
 			}
 			else {
diff --git a/source/blender/editors/transform/transform_manipulator.c b/source/blender/editors/transform/transform_manipulator.c
index e141724f2df..0a984d90ae3 100644
--- a/source/blender/editors/transform/transform_manipulator.c
+++ b/source/blender/editors/transform/transform_manipulator.c
@@ -1724,14 +1724,14 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl
 {
 	View3D *v3d = sa->spacedata.first;
 	RegionView3D *rv3d = ar->regiondata;
-	rctf rect, selrect;
+	rcti rect;
 	GLuint buffer[64];      // max 4 items per select, so large enuf
 	short hits;
 	const bool is_picksel = true;
 	const bool do_passes = GPU_select_query_check_active();
 
 	/* XXX check a bit later on this... (ton) */
-	extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, rctf *rect);
+	extern void view3d_winmatrix_set(ARegion *ar, View3D *v3d, const rcti *rect);
 
 	/* when looking through a selected camera, the manipulator can be at the
 	 * exact same position as the view, skip so we don't break selection */
@@ -1743,15 +1743,13 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl
 	rect.ymin = mval[1] - hotspot;
 	rect.ymax = mval[1] + hotspot;
 
-	selrect = rect;
-
 	view3d_winmatrix_set(ar, v3d, &rect);
 	mul_m4_m4m4(rv3d->persmat, rv3d->winmat, rv3d->viewmat);
 
 	if (do_passes)
-		GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_NEAREST_FIRST_PASS, 0);
+		GPU_select_begin(buffer, 64, &rect, GPU_SELECT_NEAREST_FIRST_PASS, 0);
 	else
-		GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_ALL, 0);
+		GPU_select_begin(buffer, 64, &rect, GPU_SELECT_ALL, 0);
 
 	/* do the drawing */
 	if (v3d->twtype & V3D_MANIP_ROTATE) {
@@ -1766,7 +1764,7 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl
 	hits = GPU_select_end();
 
 	if (do_passes) {
-		GPU_select_begin(buffer, 64, &selrect, GPU_SELECT_NEAREST_SECOND_PASS, hits);
+		GPU_select_begin(buffer, 64, &rect, GPU_SELECT_NEAREST_SECOND_PASS, hits);
 
 		/* do the drawing */
 		if (v3d->twtype & V3D_MANIP_ROTATE) {
@@ -1826,6 +1824,23 @@ static int manipulator_selectbuf(ScrArea *sa, ARegion *ar, const int mval[2], fl
 	return 0;
 }
 
+static const char *manipulator_get_operator_name(int man_val)
+{
+	/* Pick the transform operator idname for these manipulator flags,
+	 * the result stays NULL when none of the tests below match. */
+	const char *opname = NULL;
+
+	if (man_val & MAN_TRANS_C)
+		opname = "TRANSFORM_OT_translate";
+	else if (man_val == MAN_ROT_T)  /* exact trackball value is tested before the rotate mask */
+		opname = "TRANSFORM_OT_trackball";
+	else if (man_val & MAN_ROT_C)
+		opname = "TRANSFORM_OT_rotate";
+	else if (man_val & MAN_SCALE_C)
+		opname = "TRANSFORM_OT_resize";
+
+	return opname;
+}
 
 /* return 0; nothing happened */
 int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op)
@@ -1846,11 +1861,24 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op)
 	// find the hotspots first test narrow hotspot
 	val = manipulator_selectbuf(sa, ar, event->mval, 0.5f * (float)U.tw_hotspot);
 	if (val) {
+		wmOperatorType *ot;
+		PointerRNA props_ptr;
+		PropertyRNA *prop;
+		const char *opname;
 
 		// drawflags still global, for drawing call above
 		drawflags = manipulator_selectbuf(sa, ar, event->mval, 0.2f * (float)U.tw_hotspot);
 		if (drawflags == 0) drawflags = val;
 
+		/* Planar constraint doesn't make sense for rotation, give other keymaps a chance */
+		if ((drawflags & MAN_ROT_C) && use_planar) {
+			return 0;
+		}
+
+		opname = manipulator_get_operator_name(drawflags);
+		ot = WM_operatortype_find(opname, true);
+		WM_operator_properties_create_ptr(&props_ptr, ot);
+
 		if (drawflags & MAN_TRANS_C) {
 			switch (drawflags) {
 				case MAN_TRANS_C:
@@ -1880,8 +1908,7 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op)
 						constraint_axis[2] = 1;
 					break;
 			}
-			RNA_boolean_set_array(op->ptr, "constraint_axis", constraint_axis);
-			WM_operator_name_call(C, "TRANSFORM_OT_translate", WM_OP_INVOKE_DEFAULT, op->ptr);
+			RNA_boolean_set_array(&props_ptr, "constraint_axis", constraint_axis);
 		}
 		else if (drawflags & MAN_SCALE_C) {
 			switch (drawflags) {
@@ -1910,22 +1937,10 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op)
 						constraint_axis[2] = 1;
 					break;
 			}
-			RNA_boolean_set_array(op->ptr, "constraint_axis", constraint_axis);
-			WM_operator_name_call(C, "TRANSFORM_OT_resize", WM_OP_INVOKE_DEFAULT, op->ptr);
+			RNA_boolean_set_array(&props_ptr, "constraint_axis", constraint_axis);
 		}
-		else if (drawflags == MAN_ROT_T) { /* trackball need special case, init is different */
-			/* Do not pass op->ptr!!! trackball has no "constraint" properties!
-			 * See [#34621], it's a miracle it did not cause more problems!!! */
-			/* However, we need to copy the "release_confirm" property, but only if defined, see T41112. */
-			PointerRNA props_ptr;
-			PropertyRNA *prop;
-			wmOperatorType *ot = WM_operatortype_find("TRANSFORM_OT_trackball", true);
-			WM_operator_properties_create_ptr(&props_ptr, ot);
-			if ((prop = RNA_struct_find_property(op->ptr, "release_confirm")) && RNA_property_is_set(op->ptr, prop)) {
-				RNA_property_boolean_set(&props_ptr, prop, RNA_property_boolean_get(op->ptr, prop));
-			}
-			WM_operator_name_call_ptr(C, ot, WM_OP_INVOKE_DEFAULT, &props_ptr);
-			WM_operator_properties_free(&props_ptr);
+		else if (drawflags == MAN_ROT_T) {
+			/* pass */
 		}
 		else if (drawflags & MAN_ROT_C) {
 			switch (drawflags) {
@@ -1939,9 +1954,25 @@ int BIF_do_manipulator(bContext *C, const struct wmEvent *event, wmOperator *op)
 					constraint_axis[2] = 1;
 					break;
 			}
-			RNA_boolean_set_array(op->ptr, "constraint_axis", constraint_axis);
-			WM_operator_name_call(C, "TRANSFORM_OT_rotate", WM_OP_INVOKE_DEFAULT, op->ptr);
+			RNA_boolean_set_array(&props_ptr, "constraint_axis", constraint_axis);
+		}
+
+		/* pass operator properties on to transform operators, skipping any the calling operator does not define or has not set */
+		prop = RNA_struct_find_property(op->ptr, "use_accurate");
+		if (prop && RNA_property_is_set(op->ptr, prop)) {
+			RNA_property_boolean_set(&props_ptr, prop, RNA_property_boolean_get(op->ptr, prop));
+		}
+		prop = RNA_struct_find_property(op->ptr, "release_confirm");
+		if (prop && RNA_property_is_set(op->ptr, prop)) {
+			RNA_property_boolean_set(&props_ptr, prop, RNA_property_boolean_get(op->ptr, prop));
 		}
+		prop = RNA_struct_find_property(op->ptr, "constraint_orientation");
+		if (prop && RNA_property_is_set(op->ptr, prop)) {
+			RNA_property_enum_set(&props_ptr, prop, RNA_property_enum_get(op->ptr, prop));
+		}
+
+		WM_operator_name_call_ptr(C, ot, WM_OP_INVOKE_DEFAULT, &props_ptr);
+		WM_operator_properties_free(&props_ptr);
 	}
 	/* after transform, restore drawflags */
 	drawflags = 0xFFFF;
diff --git a/source/blender/editors/transform/transform_ops.c b/source/blender/editors/transform/transform_ops.c
index cbe58ddf586..2a97384cf7d 100644
--- a/source/blender/editors/transform/transform_ops.c
+++ b/source/blender/editors/transform/transform_ops.c
@@ -569,6 +569,9 @@ void Transform_Properties(struct wmOperatorType *ot, int flags)
 		// Add confirm method all the time. At the end because it's not really that important and should be hidden only in log, not in keymap edit
 		/*prop =*/ RNA_def_boolean(ot->srna, "release_confirm", 0, "Confirm on Release", "Always confirm operation when releasing button");
 		//RNA_def_property_flag(prop, PROP_HIDDEN);
+
+		prop = RNA_def_boolean(ot->srna, "use_accurate", 0, "Accurate", "Use accurate transformation");
+		RNA_def_property_flag(prop, PROP_HIDDEN);
 	}
 }
 
diff --git a/source/blender/editors/transform/transform_snap_object.c b/source/blender/editors/transform/transform_snap_object.c
index 7c9dc43dbe4..b33528b4149 100644
--- a/source/blender/editors/transform/transform_snap_object.c
+++ b/source/blender/editors/transform/transform_snap_object.c
@@ -87,6 +87,8 @@ typedef struct SnapObjectData {
 typedef struct SnapObjectData_Mesh {
 	SnapObjectData sd;
 	BVHTreeFromMesh *bvh_trees[3];
+	MPoly *mpoly;
+	bool poly_allocated;
 
 } SnapObjectData_Mesh;
 
@@ -934,9 +936,10 @@ static bool snapEmpty(
 			float tmp_co[3];
 			copy_v3_v3(tmp_co, obmat[3]);
 			if (test_projected_vert_dist(
-				        snapdata->depth_range, snapdata->mval, tmp_co,
-				        snapdata->pmat, snapdata->win_half, is_persp, &dist_px_sq,
-				        r_loc)) {
+			        snapdata->depth_range, snapdata->mval, tmp_co,
+			        snapdata->pmat, snapdata->win_half, is_persp, &dist_px_sq,
+			        r_loc))
+			{
 				*dist_px = sqrtf(dist_px_sq);
 				*ray_depth = depth_get(r_loc, snapdata->ray_start, snapdata->ray_dir);
 				retval = true;
@@ -1051,7 +1054,6 @@ static int dm_looptri_to_poly_index(DerivedMesh *dm, const MLoopTri *lt)
 static bool snapDerivedMesh(
         SnapObjectContext *sctx, SnapData *snapdata,
         Object *ob, DerivedMesh *dm, float obmat[4][4], const unsigned int ob_index,
-        bool do_bb,
         /* read/write args */
         float *ray_depth, float *dist_px,
         /* return args */
@@ -1112,39 +1114,30 @@ static bool snapDerivedMesh(
 	copy_v3_v3(ray_org_local, snapdata->ray_origin);
 	mul_m4_v3(imat, ray_org_local);
 
-	if (do_bb) {
-		BoundBox *bb = BKE_object_boundbox_get(ob);
-
-		if (bb) {
-			BoundBox bb_temp;
-
-			/* We cannot afford a bounding box with some null dimension, which may happen in some cases...
-			 * Threshold is rather high, but seems to be needed to get good behavior, see T46099. */
-			bb = BKE_boundbox_ensure_minimum_dimensions(bb, &bb_temp, 1e-1f);
-
-			/* In vertex and edges you need to get the pixel distance from ray to BoundBox, see T46816. */
-			if (ELEM(snapdata->snap_to, SCE_SNAP_MODE_VERTEX, SCE_SNAP_MODE_EDGE)) {
-				float dist_px_sq = dist_squared_to_projected_aabb_simple(
-					    lpmat, snapdata->win_half, ray_min_dist, snapdata->mval,
-					    ray_org_local, ray_normal_local, bb->vec[0], bb->vec[6]);
-				if (dist_px_sq > SQUARE(*dist_px))
-				{
-					return retval;
-				}
+	/* Test BoundBox */
+	BoundBox *bb = BKE_object_boundbox_get(ob);
+	if (bb) {
+		/* In vertex and edges you need to get the pixel distance from ray to BoundBox, see: T46099, T46816 */
+		if (ELEM(snapdata->snap_to, SCE_SNAP_MODE_VERTEX, SCE_SNAP_MODE_EDGE)) {
+			float dist_px_sq = dist_squared_to_projected_aabb_simple(
+			        lpmat, snapdata->win_half, ray_min_dist, snapdata->mval,
+			        ray_org_local, ray_normal_local, bb->vec[0], bb->vec[6]);
+			if (dist_px_sq > SQUARE(*dist_px)) {
+				return retval;
 			}
-			else {
-				/* was BKE_boundbox_ray_hit_check, see: cf6ca226fa58 */
-				if (!isect_ray_aabb_v3_simple(
-					ray_start_local, ray_normal_local, bb->vec[0], bb->vec[6], NULL, NULL))
-				{
-					return retval;
-				}
+		}
+		else {
+			/* was BKE_boundbox_ray_hit_check, see: cf6ca226fa58 */
+			if (!isect_ray_aabb_v3_simple(
+			        ray_start_local, ray_normal_local, bb->vec[0], bb->vec[6], NULL, NULL))
+			{
+				return retval;
 			}
-			/* was local_depth, see: T47838 */
-			len_diff = dist_aabb_to_plane(bb->vec[0], bb->vec[6], ray_start_local, ray_normal_local);
-			if (len_diff < 0) len_diff = 0.0f;
-			need_ray_start_correction_init = false;
 		}
+		/* was local_depth, see: T47838 */
+		len_diff = dist_aabb_to_plane(bb->vec[0], bb->vec[6], ray_start_local, ray_normal_local);
+		if (len_diff < 0) len_diff = 0.0f;
+		need_ray_start_correction_init = false;
 	}
 
 	SnapObjectData_Mesh *sod = NULL;
@@ -1182,6 +1175,29 @@ static bool snapDerivedMesh(
 			if (treedata->cached && !bvhcache_has_tree(dm->bvhCache, treedata->tree)) {
 				free_bvhtree_from_mesh(treedata);
 			}
+			else {
+				if (!treedata->vert_allocated) {
+					treedata->vert = DM_get_vert_array(dm, &treedata->vert_allocated);
+				}
+				if ((tree_index == 1) && !treedata->edge_allocated) {
+					treedata->edge = DM_get_edge_array(dm, &treedata->edge_allocated);  /* ownership goes in the edge flag */
+				}
+				if (tree_index == 2) {
+					if (!treedata->loop_allocated) {
+						treedata->loop = DM_get_loop_array(dm, &treedata->loop_allocated);
+					}
+					if (!treedata->looptri_allocated) {
+						if (!sod->poly_allocated) {
+							sod->mpoly = DM_get_poly_array(dm, &sod->poly_allocated);
+						}
+						treedata->looptri = DM_get_looptri_array(
+						        dm, treedata->vert,
+						        sod->mpoly, dm->getNumPolys(dm),
+						        treedata->loop, dm->getNumLoops(dm),
+						        &treedata->looptri_allocated);
+					}
+				}
+			}
 		}
 	}
 
@@ -1295,10 +1311,17 @@ static bool snapDerivedMesh(
 	}
 	/* SCE_SNAP_MODE_VERTEX or SCE_SNAP_MODE_EDGE */
 	else {
+
+		/* Warning: the depth_max is currently being used only in perspective view.
+		 * It is not correct to limit the maximum depth for elements obtained with nearest
+		 * since this limitation depends on the normal and the size of the occlusion face.
+		 * And more... ray_depth is being confused with Z-depth here... (varies only the precision) */
+		const float ray_depth_max_global = *ray_depth + snapdata->depth_range[0];
+
 		Nearest2dUserData neasrest2d = {
 		    .dist_px_sq = SQUARE(*dist_px),
 		    .r_axis_closest = {1.0f, 1.0f, 1.0f},
-		    .depth_range = {snapdata->depth_range[0], *ray_depth + snapdata->depth_range[0]},
+		    .depth_range = {snapdata->depth_range[0], ray_depth_max_global},
 		    .userdata = treedata,
 		    .get_edge_verts = (Nearest2DGetEdgeVertsCallback)get_dm_edge_verts,
 		    .copy_vert_no = (Nearest2DCopyVertNoCallback)copy_dm_vert_no,
@@ -1650,7 +1673,6 @@ static bool snapObject(
 			}
 			retval = snapDerivedMesh(
 			        sctx, snapdata, ob, dm, obmat, ob_index,
-			        true,
 			        ray_depth, dist_px,
 			        r_loc, r_no,
 			        r_index, r_hit_list);
@@ -1858,6 +1880,9 @@ static void snap_object_data_free(void *sod_v)
 					free_bvhtree_from_mesh(sod->bvh_trees[i]);
 				}
 			}
+			if (sod->poly_allocated) {
+				MEM_freeN(sod->mpoly);
+			}
 			break;
 		}
 		case SNAP_EDIT_MESH:
@@ -2053,10 +2078,10 @@ static bool transform_snap_context_project_view3d_mixed_impl(
 		for (int i = 0; i < 3; i++) {
 			if (snap_to_flag & (1 << i)) {
 				if (ED_transform_snap_object_project_view3d(
-					sctx,
-					elem_type[i], params,
-					mval, dist_px, &ray_depth,
-					r_co, r_no))
+				        sctx,
+				        elem_type[i], params,
+				        mval, dist_px, &ray_depth,
+				        r_co, r_no))
 				{
 					is_hit = true;
 					break;
diff --git a/source/blender/editors/util/numinput.c b/source/blender/editors/util/numinput.c
index e07831358d6..3727be1842c 100644
--- a/source/blender/editors/util/numinput.c
+++ b/source/blender/editors/util/numinput.c
@@ -497,7 +497,7 @@ bool handleNumInput(bContext *C, NumInput *n, const wmEvent *event)
 		                    n->unit_sys, n->unit_type[idx]);
 
 		/* Note: with angles, we always get values as radians here... */
-		if (BPY_execute_string_as_number(C, str_unit_convert, &val, false)) {
+		if (BPY_execute_string_as_number(C, str_unit_convert, false, &val)) {
 			n->val[idx] = (float)val;
 			n->val_flag[idx] &= ~NUM_INVALID;
 		}
diff --git a/source/blender/editors/util/undo.c b/source/blender/editors/util/undo.c
index fab5b7e821f..4e021d4833e 100644
--- a/source/blender/editors/util/undo.c
+++ b/source/blender/editors/util/undo.c
@@ -419,6 +419,9 @@ int ED_undo_operator_repeat(bContext *C, struct wmOperator *op)
 
 			if (G.debug & G_DEBUG)
 				printf("redo_cb: operator redo %s\n", op->type->name);
+
+			WM_operator_free_all_after(wm, op);
+
 			ED_undo_pop_op(C, op);
 
 			if (op->type->check) {
diff --git a/source/blender/gpu/CMakeLists.txt b/source/blender/gpu/CMakeLists.txt
index 8885209ce01..885ff2ff159 100644
--- a/source/blender/gpu/CMakeLists.txt
+++ b/source/blender/gpu/CMakeLists.txt
@@ -57,6 +57,8 @@ set(SRC
 	intern/gpu_init_exit.c
 	intern/gpu_material.c
 	intern/gpu_select.c
+	intern/gpu_select_pick.c
+	intern/gpu_select_sample_query.c
 	intern/gpu_shader.c
 	intern/gpu_texture.c
 
@@ -97,6 +99,7 @@ set(SRC
 	GPU_texture.h
 	intern/gpu_codegen.h
 	intern/gpu_private.h
+	intern/gpu_select_private.h
 )
 
 data_to_c_simple(shaders/gpu_shader_geometry.glsl SRC)
diff --git a/source/blender/gpu/GPU_select.h b/source/blender/gpu/GPU_select.h
index 6a16b5b7456..cf5b8bf7d8f 100644
--- a/source/blender/gpu/GPU_select.h
+++ b/source/blender/gpu/GPU_select.h
@@ -30,19 +30,30 @@
 #ifndef __GPU_SELECT_H__
 #define __GPU_SELECT_H__
 
-#include "DNA_vec_types.h"  /* rcft */
 #include "BLI_sys_types.h"
 
+struct rcti;
+
 /* flags for mode of operation */
 enum {
 	GPU_SELECT_ALL                      = 1,
+	/* gpu_select_query */
 	GPU_SELECT_NEAREST_FIRST_PASS       = 2,
 	GPU_SELECT_NEAREST_SECOND_PASS      = 3,
+	/* gpu_select_pick */
+	GPU_SELECT_PICK_ALL           = 4,
+	GPU_SELECT_PICK_NEAREST       = 5,
 };
 
-void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, rctf *input, char mode, int oldhits);
+void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const struct rcti *input, char mode, int oldhits);
 bool GPU_select_load_id(unsigned int id);
 unsigned int GPU_select_end(void);
 bool GPU_select_query_check_active(void);
 
+/* cache selection region */
+bool GPU_select_is_cached(void);
+void GPU_select_cache_begin(void);
+void GPU_select_cache_load_id(void);
+void GPU_select_cache_end(void);
+
 #endif
diff --git a/source/blender/gpu/intern/gpu_codegen.c b/source/blender/gpu/intern/gpu_codegen.c
index 211394e7932..c3896fbd659 100644
--- a/source/blender/gpu/intern/gpu_codegen.c
+++ b/source/blender/gpu/intern/gpu_codegen.c
@@ -360,7 +360,7 @@ static void codegen_print_datatype(DynStr *ds, const GPUType type, float *data)
 	BLI_dynstr_appendf(ds, "%s(", GPU_DATATYPE_STR[type]);
 
 	for (i = 0; i < type; i++) {
-		BLI_dynstr_appendf(ds, "%f", data[i]);
+		BLI_dynstr_appendf(ds, "%.12f", data[i]);
 		if (i == type - 1)
 			BLI_dynstr_append(ds, ")");
 		else
diff --git a/source/blender/gpu/intern/gpu_debug.c b/source/blender/gpu/intern/gpu_debug.c
index d632e767ca9..ba68d1a6a0f 100644
--- a/source/blender/gpu/intern/gpu_debug.c
+++ b/source/blender/gpu/intern/gpu_debug.c
@@ -161,7 +161,7 @@ const char *gpuErrorString(GLenum err)
 #endif
 
 
-static const char* source_name(GLenum source)
+static const char *source_name(GLenum source)
 {
 	switch (source) {
 		case GL_DEBUG_SOURCE_API: return "API";
@@ -174,7 +174,7 @@ static const char* source_name(GLenum source)
 	}
 }
 
-static const char* message_type_name(GLenum message)
+static const char *message_type_name(GLenum message)
 {
 	switch (message) {
 		case GL_DEBUG_TYPE_ERROR: return "error";
@@ -188,7 +188,7 @@ static const char* message_type_name(GLenum message)
 	}
 }
 
-static const char* category_name_amd(GLenum category)
+static const char *category_name_amd(GLenum category)
 {
 	switch (category) {
 		case GL_DEBUG_CATEGORY_API_ERROR_AMD: return "API error";
diff --git a/source/blender/gpu/intern/gpu_select.c b/source/blender/gpu/intern/gpu_select.c
index 58582232cd5..9496ff137dc 100644
--- a/source/blender/gpu/intern/gpu_select.c
+++ b/source/blender/gpu/intern/gpu_select.c
@@ -29,109 +29,86 @@
  * Interface for accessing gpu-related methods for selection. The semantics will be
  * similar to glRenderMode(GL_SELECT) since the goal is to maintain compatibility.
  */
+#include <stdlib.h>
+
 #include "GPU_select.h"
 #include "GPU_extensions.h"
 #include "GPU_glew.h"
- 
+
 #include "MEM_guardedalloc.h"
 
 #include "DNA_userdef_types.h"
 
 #include "BLI_utildefines.h"
 
-/* Ad hoc number of queries to allocate to skip doing many glGenQueries */
-#define ALLOC_QUERIES 200
-
-typedef struct GPUQueryState {
+#include "gpu_select_private.h"
+
+/* Internal algorithm used */
+enum {
+	/** GL_SELECT, legacy OpenGL selection */
+	ALGO_GL_LEGACY = 1,
+	/** glBegin/EndQuery(GL_SAMPLES_PASSED... ), `gpu_select_sample_query.c`
+	 * Only sets 4th component (ID) correctly. */
+	ALGO_GL_QUERY = 2,
+	/** Read depth buffer for every drawing pass and extract depths, `gpu_select_pick.c`
+	 * Only sets 4th component (ID) correctly. */
+	ALGO_GL_PICK = 3,
+};
+
+typedef struct GPUSelectState {
 	/* To ignore selection id calls when not initialized */
 	bool select_is_active;
-	/* Tracks whether a query has been issued so that gpu_load_id can end the previous one */
-	bool query_issued;
-	/* array holding the OpenGL query identifiers */
-	unsigned int *queries;
-	/* array holding the id corresponding to each query */
-	unsigned int *id;
-	/* number of queries in *queries and *id */
-	unsigned int num_of_queries;
-	/* index to the next query to start */
-	unsigned int active_query;
 	/* flag to cache user preference for occlusion based selection */
 	bool use_gpu_select;
-	/* cache on initialization */
-	unsigned int *buffer;
-	/* buffer size (stores number of integers, for actual size multiply by sizeof integer)*/
-	unsigned int bufsize;
 	/* mode of operation */
 	char mode;
-	unsigned int index;
-	int oldhits;
-} GPUQueryState;
+	/* internal algorithm for selection */
+	char algorithm;
+	/* allow GPU_select_begin/end without drawing */
+	bool use_cache;
+} GPUSelectState;
 
-static GPUQueryState g_query_state = {0};
+static GPUSelectState g_select_state = {0};
 
 /**
  * initialize and provide buffer for results
  */
-void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, rctf *input, char mode, int oldhits)
+void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, const rcti *input, char mode, int oldhits)
 {
-	g_query_state.select_is_active = true;
-	g_query_state.query_issued = false;
-	g_query_state.active_query = 0;
-	g_query_state.use_gpu_select = GPU_select_query_check_active();
-	g_query_state.num_of_queries = 0;
-	g_query_state.bufsize = bufsize;
-	g_query_state.buffer = buffer;
-	g_query_state.mode = mode;
-	g_query_state.index = 0;
-	g_query_state.oldhits = oldhits;
+	g_select_state.select_is_active = true;
+	g_select_state.use_gpu_select = GPU_select_query_check_active();
+	g_select_state.mode = mode;
 
-	if (!g_query_state.use_gpu_select) {
-		glSelectBuffer(bufsize, (GLuint *)buffer);
-		glRenderMode(GL_SELECT);
-		glInitNames();
-		glPushName(-1);
+	if (ELEM(g_select_state.mode, GPU_SELECT_PICK_ALL, GPU_SELECT_PICK_NEAREST)) {
+		g_select_state.algorithm = ALGO_GL_PICK;
+	}
+	else if (!g_select_state.use_gpu_select) {
+		g_select_state.algorithm = ALGO_GL_LEGACY;
 	}
 	else {
-		float viewport[4];
-
-		g_query_state.num_of_queries = ALLOC_QUERIES;
-
-		g_query_state.queries = MEM_mallocN(g_query_state.num_of_queries * sizeof(*g_query_state.queries), "gpu selection queries");
-		g_query_state.id = MEM_mallocN(g_query_state.num_of_queries * sizeof(*g_query_state.id), "gpu selection ids");
-		glGenQueries(g_query_state.num_of_queries, g_query_state.queries);
-
-		glPushAttrib(GL_DEPTH_BUFFER_BIT | GL_VIEWPORT_BIT);
-		/* disable writing to the framebuffer */
-		glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
-
-		/* In order to save some fill rate we minimize the viewport using rect.
-		 * We need to get the region of the scissor so that our geometry doesn't
-		 * get rejected before the depth test. Should probably cull rect against
-		 * scissor for viewport but this is a rare case I think */
-		glGetFloatv(GL_SCISSOR_BOX, viewport);
-		if (!input || input->xmin == input->xmax) {
-			glViewport(viewport[0], viewport[1], 24, 24);
-		}
-		else {
-			glViewport(viewport[0], viewport[1], (int)(input->xmax - input->xmin), (int)(input->ymax - input->ymin));
-		}
+		g_select_state.algorithm = ALGO_GL_QUERY;
+	}
 
-		/* occlusion queries operates on fragments that pass tests and since we are interested on all
-		 * objects in the view frustum independently of their order, we need to disable the depth test */
-		if (mode == GPU_SELECT_ALL) {
-			glDisable(GL_DEPTH_TEST);
-			glDepthMask(GL_FALSE);
+	switch (g_select_state.algorithm) {
+		case ALGO_GL_LEGACY:
+		{
+			g_select_state.use_cache = false;
+			glSelectBuffer(bufsize, (GLuint *)buffer);
+			glRenderMode(GL_SELECT);
+			glInitNames();
+			glPushName(-1);
+			break;
 		}
-		else if (mode == GPU_SELECT_NEAREST_FIRST_PASS) {
-			glClear(GL_DEPTH_BUFFER_BIT);
-			glEnable(GL_DEPTH_TEST);
-			glDepthMask(GL_TRUE);
-			glDepthFunc(GL_LEQUAL);
+		case ALGO_GL_QUERY:
+		{
+			g_select_state.use_cache = false;
+			gpu_select_query_begin((unsigned int (*)[4])buffer, bufsize / 4, input, mode, oldhits);
+			break;
 		}
-		else if (mode == GPU_SELECT_NEAREST_SECOND_PASS) {
-			glEnable(GL_DEPTH_TEST);
-			glDepthMask(GL_FALSE);
-			glDepthFunc(GL_EQUAL);
+		default:  /* ALGO_GL_PICK */
+		{
+			gpu_select_pick_begin((unsigned int (*)[4])buffer, bufsize / 4, input, mode);
+			break;
 		}
 	}
 }
@@ -146,41 +123,24 @@ void GPU_select_begin(unsigned int *buffer, unsigned int bufsize, rctf *input, c
 bool GPU_select_load_id(unsigned int id)
 {
 	/* if no selection mode active, ignore */
-	if (!g_query_state.select_is_active)
+	if (!g_select_state.select_is_active)
 		return true;
 
-	if (!g_query_state.use_gpu_select) {
-		glLoadName(id);
-	}
-	else {
-		if (g_query_state.query_issued) {
-			glEndQuery(GL_SAMPLES_PASSED);
+	switch (g_select_state.algorithm) {
+		case ALGO_GL_LEGACY:
+		{
+			glLoadName(id);
+			return true;
 		}
-		/* if required, allocate extra queries */
-		if (g_query_state.active_query == g_query_state.num_of_queries) {
-			g_query_state.num_of_queries += ALLOC_QUERIES;
-			g_query_state.queries = MEM_reallocN(g_query_state.queries, g_query_state.num_of_queries * sizeof(*g_query_state.queries));
-			g_query_state.id = MEM_reallocN(g_query_state.id, g_query_state.num_of_queries * sizeof(*g_query_state.id));
-			glGenQueries(ALLOC_QUERIES, &g_query_state.queries[g_query_state.active_query]);
+		case ALGO_GL_QUERY:
+		{
+			return gpu_select_query_load_id(id);
 		}
-
-		glBeginQuery(GL_SAMPLES_PASSED, g_query_state.queries[g_query_state.active_query]);
-		g_query_state.id[g_query_state.active_query] = id;
-		g_query_state.active_query++;
-		g_query_state.query_issued = true;
-
-		if (g_query_state.mode == GPU_SELECT_NEAREST_SECOND_PASS && g_query_state.index < g_query_state.oldhits) {
-			if (g_query_state.buffer[g_query_state.index * 4 + 3] == id) {
-				g_query_state.index++;
-				return true;
-			}
-			else {
-				return false;
-			}
+		default:  /* ALGO_GL_PICK */
+		{
+			return gpu_select_pick_load_id(id);
 		}
 	}
-
-	return true;
 }
 
 /**
@@ -191,59 +151,27 @@ bool GPU_select_load_id(unsigned int id)
 unsigned int GPU_select_end(void)
 {
 	unsigned int hits = 0;
-	if (!g_query_state.use_gpu_select) {
-		glPopName();
-		hits = glRenderMode(GL_RENDER);
-	}
-	else {
-		int i;
 
-		if (g_query_state.query_issued) {
-			glEndQuery(GL_SAMPLES_PASSED);
+	switch (g_select_state.algorithm) {
+		case ALGO_GL_LEGACY:
+		{
+			glPopName();
+			hits = glRenderMode(GL_RENDER);
+			break;
 		}
-
-		for (i = 0; i < g_query_state.active_query; i++) {
-			unsigned int result;
-			glGetQueryObjectuiv(g_query_state.queries[i], GL_QUERY_RESULT, &result);
-			if (result > 0) {
-				if (g_query_state.mode != GPU_SELECT_NEAREST_SECOND_PASS) {
-					int maxhits = g_query_state.bufsize / 4;
-
-					if (hits < maxhits) {
-						g_query_state.buffer[hits * 4] = 1;
-						g_query_state.buffer[hits * 4 + 1] = 0xFFFF;
-						g_query_state.buffer[hits * 4 + 2] = 0xFFFF;
-						g_query_state.buffer[hits * 4 + 3] = g_query_state.id[i];
-
-						hits++;
-					}
-					else {
-						hits = -1;
-						break;
-					}
-				}
-				else {
-					int j;
-					/* search in buffer and make selected object first */
-					for (j = 0; j < g_query_state.oldhits; j++) {
-						if (g_query_state.buffer[j * 4 + 3] == g_query_state.id[i]) {
-							g_query_state.buffer[j * 4 + 1] = 0;
-							g_query_state.buffer[j * 4 + 2] = 0;
-						}
-					}
-					break;
-				}
-			}
+		case ALGO_GL_QUERY:
+		{
+			hits = gpu_select_query_end();
+			break;
+		}
+		default:  /* ALGO_GL_PICK */
+		{
+			hits = gpu_select_pick_end();
+			break;
 		}
-
-		glDeleteQueries(g_query_state.num_of_queries, g_query_state.queries);
-		MEM_freeN(g_query_state.queries);
-		MEM_freeN(g_query_state.id);
-		glPopAttrib();
-		glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
 	}
 
-	g_query_state.select_is_active = false;
+	g_select_state.select_is_active = false;
 
 	return hits;
 }
@@ -260,3 +188,41 @@ bool GPU_select_query_check_active(void)
 	          GPU_type_matches(GPU_DEVICE_NVIDIA, GPU_OS_UNIX, GPU_DRIVER_OPENSOURCE))));
 
 }
+
+/* ----------------------------------------------------------------------------
+ * Caching
+ *
+ * Support multiple begin/end's as long as they are within the initial region.
+ * Currently only used by ALGO_GL_PICK.
+ */
+
+void GPU_select_cache_begin(void)
+{
+	/* validate on GPU_select_begin, clear if not supported */
+	BLI_assert(g_select_state.use_cache == false);
+	g_select_state.use_cache = true;
+	if (g_select_state.algorithm == ALGO_GL_PICK) {
+		gpu_select_pick_cache_begin();
+	}
+}
+
+void GPU_select_cache_load_id(void)
+{
+	BLI_assert(g_select_state.use_cache == true);
+	if (g_select_state.algorithm == ALGO_GL_PICK) {
+		gpu_select_pick_cache_load_id();
+	}
+}
+
+void GPU_select_cache_end(void)
+{
+	if (g_select_state.algorithm == ALGO_GL_PICK) {
+		gpu_select_pick_cache_end();
+	}
+	g_select_state.use_cache = false;
+}
+
+bool GPU_select_is_cached(void)
+{
+	return g_select_state.use_cache && gpu_select_pick_is_cached();
+}
diff --git a/source/blender/gpu/intern/gpu_select_pick.c b/source/blender/gpu/intern/gpu_select_pick.c
new file mode 100644
index 00000000000..0a77420fa25
--- /dev/null
+++ b/source/blender/gpu/intern/gpu_select_pick.c
@@ -0,0 +1,744 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2017 Blender Foundation.
+ * All rights reserved.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/** \file blender/gpu/intern/gpu_select_pick.c
+ *  \ingroup gpu
+ *
+ * Custom select code for picking small regions (not efficient for large regions).
+ * `gpu_select_pick_*` API.
+ */
+#include <string.h>
+#include <stdlib.h>
+#include <float.h>
+
+#include "GPU_select.h"
+#include "GPU_extensions.h"
+#include "GPU_glew.h"
+ 
+#include "MEM_guardedalloc.h"
+
+#include "BLI_rect.h"
+#include "BLI_listbase.h"
+#include "BLI_math_vector.h"
+#include "BLI_utildefines.h"
+
+#include "gpu_select_private.h"
+
+#include "BLI_strict_flags.h"
+
+/* #define DEBUG_PRINT */
+
+/* Alloc number for depths */
+#define ALLOC_DEPTHS 200
+
+/* Z-depth of cleared depth buffer */
+#define DEPTH_MAX 0xffffffff
+
+/* ----------------------------------------------------------------------------
+ * SubRectStride
+ */
+
+/* For looping over a sub-region of a rect, could be moved into 'rct.c'*/
+typedef struct SubRectStride {
+	unsigned int start;     /* start here */
+	unsigned int span;      /* read these */
+	unsigned int span_len;  /* len times (read span 'len' times). */
+	unsigned int skip;      /* skip those */
+} SubRectStride;
+
+/* we may want to change back to float if uint isn't well supported */
+typedef unsigned int depth_t;
+
+/**
+ * Calculate the values needed to loop over a sub-region ('dst') of a larger buffer ('src').
+ *
+ * 'src' must fully contain 'dst' (asserted below), the result is written to 'r_sub'.
+ */
+static void rect_subregion_stride_calc(const rcti *src, const rcti *dst, SubRectStride *r_sub)
+{
+	const int src_x = BLI_rcti_size_x(src);
+	/* the height of 'src' isn't needed, rows are addressed using its width only */
+	const int dst_x = BLI_rcti_size_x(dst);
+	const int dst_y = BLI_rcti_size_y(dst);
+	const int x = dst->xmin - src->xmin;
+	const int y = dst->ymin - src->ymin;
+
+	BLI_assert(src->xmin <= dst->xmin && src->ymin <= dst->ymin &&
+	           src->xmax >= dst->xmax && src->ymax >= dst->ymax);
+	BLI_assert(x >= 0 && y >= 0);
+
+	r_sub->start    = (unsigned int)((src_x * y) + x);  /* index of the first 'dst' pixel in 'src' */
+	r_sub->span     = (unsigned int)dst_x;              /* pixels to read per row */
+	r_sub->span_len = (unsigned int)dst_y;              /* number of rows */
+	r_sub->skip     = (unsigned int)(src_x - dst_x);    /* pixels to step over between rows */
+}
+
+/**
+ * Ignore depth clearing as a change,
+ * only check if its been changed _and_ filled in (ignore clearing since XRAY does this).
+ */
+BLI_INLINE bool depth_is_filled(const depth_t *prev, const depth_t *curr)
+{
+	return (*prev != *curr) && (*curr != DEPTH_MAX);
+}
+
+/* ----------------------------------------------------------------------------
+ * DepthBufCache
+ *
+ * Result of reading glReadPixels,
+ * use for both cache and non-cached storage.
+ */
+
+/* store result of glReadPixels */
+typedef struct DepthBufCache {
+	struct DepthBufCache *next, *prev;
+	unsigned int id;
+	depth_t buf[0];
+} DepthBufCache;
+
+static DepthBufCache *depth_buf_malloc(unsigned int rect_len)
+{
+	DepthBufCache *rect = MEM_mallocN(sizeof(DepthBufCache) + sizeof(depth_t) * rect_len, __func__);
+	rect->id = SELECT_ID_NONE;
+	return rect;
+}
+
+static bool depth_buf_rect_depth_any(
+        const DepthBufCache *rect_depth,
+        unsigned int rect_len)
+{
+	const depth_t *curr = rect_depth->buf;
+	for (unsigned int i = 0; i < rect_len; i++, curr++) {
+		if (*curr != DEPTH_MAX) {
+			return true;
+		}
+	}
+	return false;
+}
+
+static bool depth_buf_subrect_depth_any(
+        const DepthBufCache *rect_depth,
+        const SubRectStride *sub_rect)
+{
+	const depth_t *curr = rect_depth->buf + sub_rect->start;  /* first pixel of the sub-rect */
+	for (unsigned int i = 0; i < sub_rect->span_len; i++) {
+		const depth_t *curr_end = curr + sub_rect->span;
+		for (; curr < curr_end; curr++) {  /* step one pixel at a time so no depth is skipped */
+			if (*curr != DEPTH_MAX) {
+				return true;
+			}
+		}
+		curr += sub_rect->skip;  /* step over the pixels outside the sub-rect to reach the next row */
+	}
+	return false;  /* every pixel in the sub-rect is still DEPTH_MAX */
+}
+
+static bool depth_buf_rect_depth_any_filled(
+        const DepthBufCache *rect_prev, const DepthBufCache *rect_curr,
+        unsigned int rect_len)
+{
+#if 0
+	return memcmp(rect_depth_a->buf, rect_depth_b->buf, rect_len * sizeof(depth_t)) != 0;
+#else
+	const depth_t *prev = rect_prev->buf;
+	const depth_t *curr = rect_curr->buf;
+	for (unsigned int i = 0; i < rect_len; i++, curr++, prev++) {
+		if (depth_is_filled(prev, curr)) {
+			return true;
+		}
+	}
+	return false;
+#endif
+}
+
+/**
+ * Both buffers are the same size, just check if the sub-rect contains any differences.
+ */
+static bool depth_buf_subrect_depth_any_filled(
+        const DepthBufCache *rect_src, const DepthBufCache *rect_dst,
+        const SubRectStride *sub_rect)
+{
+	/* same as above but different rect sizes */
+	const depth_t *prev = rect_src->buf + sub_rect->start;
+	const depth_t *curr = rect_dst->buf + sub_rect->start;
+	for (unsigned int i = 0; i < sub_rect->span_len; i++) {
+		const depth_t *curr_end = curr + sub_rect->span;
+		for (; curr < curr_end; prev++, curr++) {
+			if (depth_is_filled(prev, curr)) {
+				return true;
+			}
+		}
+		prev += sub_rect->skip;
+		curr += sub_rect->skip;
+	}
+	return false;
+}
+
+/* ----------------------------------------------------------------------------
+ * DepthID
+ *
+ * Internal structure for storing hits.
+ */
+
+typedef struct DepthID {
+	unsigned int id;
+	depth_t depth;
+} DepthID;
+
+static int depth_id_cmp(const void *v1, const void *v2)
+{
+	const DepthID *d1 = v1, *d2 = v2;
+	if (d1->id < d2->id) {
+		return -1;
+	}
+	else if (d1->id > d2->id) {
+		return 1;
+	}
+	else {
+		return 0;
+	}
+}
+
+static int depth_cmp(const void *v1, const void *v2)
+{
+	const DepthID *d1 = v1, *d2 = v2;
+	if (d1->depth < d2->depth) {
+		return -1;
+	}
+	else if (d1->depth > d2->depth) {
+		return 1;
+	}
+	else {
+		return 0;
+	}
+}
+
+/* State shared by the `gpu_select_pick_*` functions (depth-buffer based picking) */
+typedef struct GPUPickState {
+	/* Caller's hit buffer (4 values per hit), stored by gpu_select_pick_begin */
+	unsigned int (*buffer)[4];
+
+	/* Number of hits 'buffer' can hold, gpu_select_pick_end uses this as the hit limit */
+	unsigned int bufsize;
+	/* mode of operation (GPU_SELECT_PICK_ALL, any other value is handled as nearest) */
+	char mode;
+
+	/* OpenGL drawing, never use when (is_cached == true). */
+	struct {
+		/* The current depth, accumulated as we draw */
+		DepthBufCache *rect_depth;
+		/* Scratch buffer, avoid allocs every time (when not caching) */
+		DepthBufCache *rect_depth_test;
+
+		/* Pass to glReadPixels (x, y, w, h) */
+		int clip_readpixels[4];
+
+		/* Set after first draw */
+		bool is_init;
+		unsigned int prev_id;  /* ID passed to the previous gpu_select_pick_load_id call */
+	} gl;
+
+	/* src: data stored in 'cache' and 'gl',
+	 * dst: use when cached region is smaller (where src -> dst isn't 1:1) */
+	struct {
+		rcti clip_rect;
+		unsigned int rect_len;
+	} src, dst;
+
+	/* Store cache between `GPU_select_cache_begin/end` */
+	bool use_cache;
+	bool is_cached;  /* set by gpu_select_pick_end when 'use_cache' is enabled */
+	struct {
+		/* Strides used for iterating over the destination region within source sized buffers:
+		 * src.clip_rect -> dst.clip_rect */
+		SubRectStride sub_rect;
+
+		/* List of DepthBufCache, each the size of 'src.clip_rect' */
+		ListBase bufs;
+	} cache;
+
+	/* Picking methods */
+	union {
+		/* GPU_SELECT_PICK_ALL */
+		struct {
+			DepthID *hits;
+			unsigned int hits_len;
+			unsigned int hits_len_alloc;
+		} all;
+
+		/* GPU_SELECT_PICK_NEAREST */
+		struct {
+			unsigned int *rect_id;  /* one ID per pixel ('dst.rect_len' items) */
+		} nearest;
+	};
+} GPUPickState;
+
+
+static GPUPickState g_pick_state = {0};
+
+void gpu_select_pick_begin(
+        unsigned int (*buffer)[4], unsigned int bufsize,
+        const rcti *input, char mode)
+{
+	GPUPickState *ps = &g_pick_state;
+
+#ifdef DEBUG_PRINT
+	printf("%s: mode=%d, use_cache=%d, is_cache=%d\n", __func__, mode, ps->use_cache, ps->is_cached);
+#endif
+
+	ps->bufsize = bufsize;
+	ps->buffer = buffer;
+	ps->mode = mode;
+
+	const unsigned int rect_len = (unsigned int)(BLI_rcti_size_x(input) * BLI_rcti_size_y(input));
+	ps->dst.clip_rect = *input;
+	ps->dst.rect_len = rect_len;
+
+	/* Restrict OpenGL operations for when we don't have cache */
+	if (ps->is_cached == false) {
+
+		glPushAttrib(GL_DEPTH_BUFFER_BIT | GL_VIEWPORT_BIT);
+		/* disable writing to the framebuffer */
+		glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+
+		glEnable(GL_DEPTH_TEST);
+		glDepthMask(GL_TRUE);
+
+		if (mode == GPU_SELECT_PICK_ALL) {
+			glDepthFunc(GL_ALWAYS);
+		}
+		else {
+			glDepthFunc(GL_LEQUAL);
+		}
+
+		/* set just in case */
+		glPixelTransferf(GL_DEPTH_BIAS, 0.0);
+		glPixelTransferf(GL_DEPTH_SCALE, 1.0);
+
+		float viewport[4];
+		glGetFloatv(GL_SCISSOR_BOX, viewport);
+
+		ps->src.clip_rect = *input;
+		ps->src.rect_len = rect_len;
+
+		ps->gl.clip_readpixels[0] = (int)viewport[0];
+		ps->gl.clip_readpixels[1] = (int)viewport[1];
+		ps->gl.clip_readpixels[2] = BLI_rcti_size_x(&ps->src.clip_rect);
+		ps->gl.clip_readpixels[3] = BLI_rcti_size_y(&ps->src.clip_rect);
+
+		glViewport(UNPACK4(ps->gl.clip_readpixels));
+
+		/* It's possible we don't want to clear depth buffer,
+		 * so existing elements are masked by current z-buffer. */
+		glClear(GL_DEPTH_BUFFER_BIT);
+
+		/* scratch buffer (read new values here) */
+		ps->gl.rect_depth_test = depth_buf_malloc(rect_len);
+		ps->gl.rect_depth = depth_buf_malloc(rect_len);
+
+		/* set initial 'far' value */
+#if 0
+		glReadPixels(UNPACK4(ps->gl.clip_readpixels), GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, ps->gl.rect_depth->buf);
+#else
+		for (unsigned int i = 0; i < rect_len; i++) {
+			ps->gl.rect_depth->buf[i] = DEPTH_MAX;
+		}
+#endif
+
+		ps->gl.is_init = false;
+		ps->gl.prev_id = 0;
+	}
+	else {
+		/* Using cache (ps->is_cached == true) */
+		/* src.clip_rect -> dst.clip_rect */
+		rect_subregion_stride_calc(&ps->src.clip_rect, &ps->dst.clip_rect, &ps->cache.sub_rect);
+		BLI_assert(ps->gl.rect_depth == NULL);
+		BLI_assert(ps->gl.rect_depth_test == NULL);
+	}
+
+	if (mode == GPU_SELECT_PICK_ALL) {
+		ps->all.hits = MEM_mallocN(sizeof(*ps->all.hits) * ALLOC_DEPTHS, __func__);
+		ps->all.hits_len = 0;
+		ps->all.hits_len_alloc = ALLOC_DEPTHS;
+	}
+	else {
+		/* Set to 0xff for SELECT_ID_NONE */
+		ps->nearest.rect_id = MEM_mallocN(sizeof(unsigned int) * ps->dst.rect_len, __func__);
+		memset(ps->nearest.rect_id, 0xff, sizeof(unsigned int) * ps->dst.rect_len);
+	}
+}
+
+/**
+ * Given 2x depths, we know are different - update the depth information
+ * use for both cached/uncached depth buffers.
+ */
+static void gpu_select_load_id_pass_all(const DepthBufCache *rect_curr)
+{
+	GPUPickState *ps = &g_pick_state;
+	const unsigned int id = rect_curr->id;
+	/* find the best depth for this pass and store in 'all.hits' */
+	depth_t depth_best = DEPTH_MAX;
+
+#define EVAL_TEST() \
+	if (depth_best > *curr) { \
+		depth_best = *curr; \
+	} ((void)0)
+
+	if (ps->is_cached == false) {
+		const depth_t *curr = rect_curr->buf;
+		BLI_assert(ps->src.rect_len == ps->dst.rect_len);
+		const unsigned int rect_len = ps->src.rect_len;
+		for (unsigned int i = 0; i < rect_len; i++, curr++) {
+			EVAL_TEST();
+		}
+	}
+	else {
+		/* same as above but different rect sizes */
+		const depth_t *curr = rect_curr->buf + ps->cache.sub_rect.start;
+		for (unsigned int i = 0; i < ps->cache.sub_rect.span_len; i++) {
+			const depth_t *curr_end = curr + ps->cache.sub_rect.span;
+			for (; curr < curr_end; curr++) {
+				EVAL_TEST();
+			}
+			curr += ps->cache.sub_rect.skip;
+		}
+	}
+
+#undef EVAL_TEST
+
+	/* ensure enough space */
+	if (UNLIKELY(ps->all.hits_len == ps->all.hits_len_alloc)) {
+		ps->all.hits_len_alloc += ALLOC_DEPTHS;
+		ps->all.hits = MEM_reallocN(ps->all.hits, ps->all.hits_len_alloc * sizeof(*ps->all.hits));
+	}
+	DepthID *d = &ps->all.hits[ps->all.hits_len++];
+	d->id = id;
+	d->depth = depth_best;
+}
+
+static void gpu_select_load_id_pass_nearest(const DepthBufCache *rect_prev, const DepthBufCache *rect_curr)
+{
+	GPUPickState *ps = &g_pick_state;
+	const unsigned int id = rect_curr->id;
+	/* keep track each pixels ID in 'nearest.rect_id' */
+	if (id != SELECT_ID_NONE) {
+		unsigned int *id_ptr = ps->nearest.rect_id;
+
+		/* Check against DEPTH_MAX because XRAY will clear the buffer,
+		 * so previously set values will become unset.
+		 * In this case just leave those id's left as-is. */
+#define EVAL_TEST() \
+		if (depth_is_filled(prev, curr)) { \
+			*id_ptr = id; \
+		} ((void)0)
+
+		if (ps->is_cached == false) {
+			const depth_t *prev = rect_prev->buf;
+			const depth_t *curr = rect_curr->buf;
+			BLI_assert(ps->src.rect_len == ps->dst.rect_len);
+			const unsigned int rect_len = ps->src.rect_len;
+			for (unsigned int i = 0; i < rect_len; i++, curr++, prev++, id_ptr++) {
+				EVAL_TEST();
+			}
+		}
+		else {
+			/* same as above but different rect sizes */
+			const depth_t *prev = rect_prev->buf + ps->cache.sub_rect.start;
+			const depth_t *curr = rect_curr->buf + ps->cache.sub_rect.start;
+			for (unsigned int i = 0; i < ps->cache.sub_rect.span_len; i++) {
+				const depth_t *curr_end = curr + ps->cache.sub_rect.span;
+				for (; curr < curr_end; prev++, curr++, id_ptr++) {
+					EVAL_TEST();
+				}
+				prev += ps->cache.sub_rect.skip;
+				curr += ps->cache.sub_rect.skip;
+			}
+		}
+
+#undef EVAL_TEST
+	}
+}
+
+
+bool gpu_select_pick_load_id(unsigned int id)
+{
+	GPUPickState *ps = &g_pick_state;
+	if (ps->gl.is_init) {
+		const unsigned int rect_len = ps->src.rect_len;
+		glReadPixels(UNPACK4(ps->gl.clip_readpixels), GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, ps->gl.rect_depth_test->buf);
+		/* perform initial check since most cases the array remains unchanged  */
+
+		bool do_pass = false;
+		if (g_pick_state.mode == GPU_SELECT_PICK_ALL) {
+			if (depth_buf_rect_depth_any(ps->gl.rect_depth_test, rect_len)) {
+				ps->gl.rect_depth_test->id = ps->gl.prev_id;
+				gpu_select_load_id_pass_all(ps->gl.rect_depth_test);
+				do_pass = true;
+			}
+		}
+		else {
+			if (depth_buf_rect_depth_any_filled(ps->gl.rect_depth, ps->gl.rect_depth_test, rect_len)) {
+				ps->gl.rect_depth_test->id = ps->gl.prev_id;
+				gpu_select_load_id_pass_nearest(ps->gl.rect_depth, ps->gl.rect_depth_test);
+				do_pass = true;
+			}
+		}
+
+		if (do_pass) {
+			/* Store depth in cache */
+			if (ps->use_cache) {
+				BLI_addtail(&ps->cache.bufs, ps->gl.rect_depth);
+				ps->gl.rect_depth = depth_buf_malloc(ps->src.rect_len);
+			}
+
+			SWAP(DepthBufCache *, ps->gl.rect_depth, ps->gl.rect_depth_test);
+
+			if (g_pick_state.mode == GPU_SELECT_PICK_ALL) {
+				/* we want new depths every time */
+				glClear(GL_DEPTH_BUFFER_BIT);
+			}
+		}
+	}
+
+	ps->gl.is_init = true;
+	ps->gl.prev_id = id;
+
+	return true;
+}
+
+/* Finish picking: write the hits, sorted by depth (nearest first), into the buffer passed to
+ * gpu_select_pick_begin. Returns the number of hits, or (unsigned int)-1 if they don't fit. */
+unsigned int gpu_select_pick_end(void)
+{
+	GPUPickState *ps = &g_pick_state;
+
+#ifdef DEBUG_PRINT
+	printf("%s\n", __func__);
+#endif
+
+	if (ps->is_cached == false) {
+		if (ps->gl.is_init) {
+			/* force finishing last pass */
+			gpu_select_pick_load_id(ps->gl.prev_id);
+		}
+
+		glPopAttrib();
+		glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+	}
+
+	/* assign but never free directly since it may be in cache */
+	DepthBufCache *rect_depth_final;
+
+	/* Store depth in cache */
+	if (ps->use_cache && !ps->is_cached) {
+		BLI_addtail(&ps->cache.bufs, ps->gl.rect_depth);
+		ps->gl.rect_depth = NULL;
+		rect_depth_final = ps->cache.bufs.last;
+	}
+	else if (ps->is_cached) {
+		rect_depth_final = ps->cache.bufs.last;
+	}
+	else {
+		/* common case, no cache */
+		rect_depth_final = ps->gl.rect_depth;
+	}
+
+	const unsigned int maxhits = g_pick_state.bufsize;
+	DepthID *depth_data;
+	unsigned int depth_data_len = 0;
+
+	if (g_pick_state.mode == GPU_SELECT_PICK_ALL) {
+		depth_data = ps->all.hits;
+		depth_data_len = ps->all.hits_len;
+		/* move ownership */
+		ps->all.hits = NULL;
+		ps->all.hits_len = 0;
+		ps->all.hits_len_alloc = 0;
+	}
+	else {
+		/* GPU_SELECT_PICK_NEAREST */
+
+		/* Over alloc (unlikely we have as many depths as pixels) */
+		unsigned int depth_data_len_first_pass = 0;
+		depth_data = MEM_mallocN(ps->dst.rect_len * sizeof(*depth_data), __func__);
+
+		/* Partially de-duplicating copy, 'depth_last' tracks the most recently added entry:
+		 * when contiguous ID's are found - update their closest depth.
+		 * This isn't essential but means there is less data to sort. */
+
+#define EVAL_TEST(i_src, i_dst) \
+		{ \
+			const unsigned int id = ps->nearest.rect_id[i_dst]; \
+			if (id != SELECT_ID_NONE) { \
+				const depth_t depth = rect_depth_final->buf[i_src]; \
+				if (depth_last == NULL || depth_last->id != id) { \
+					depth_last = &depth_data[depth_data_len_first_pass++]; \
+					depth_last->id = id; \
+					depth_last->depth = depth; \
+				} \
+				else if (depth_last->depth > depth) { \
+					depth_last->depth = depth; \
+				} \
+			} \
+		} ((void)0)
+
+		{
+			DepthID *depth_last = NULL;
+			if (ps->is_cached == false) {
+				for (unsigned int i = 0; i < ps->src.rect_len; i++) {
+					EVAL_TEST(i, i);
+				}
+			}
+			else {
+				/* same as above but different rect sizes */
+				unsigned int i_src = ps->cache.sub_rect.start, i_dst = 0;
+				for (unsigned int j = 0; j < ps->cache.sub_rect.span_len; j++) {
+					const unsigned int i_src_end = i_src + ps->cache.sub_rect.span;
+					for (; i_src < i_src_end; i_src++, i_dst++) {
+						EVAL_TEST(i_src, i_dst);
+					}
+					i_src += ps->cache.sub_rect.skip;
+				}
+			}
+		}
+
+#undef EVAL_TEST
+
+		qsort(depth_data, depth_data_len_first_pass, sizeof(DepthID), depth_id_cmp);
+
+		/* Sort by ID's then keep the best depth for each ID */
+		depth_data_len = 0;
+		{
+			DepthID *depth_last = NULL;
+			for (unsigned int i = 0; i < depth_data_len_first_pass; i++) {
+				if (depth_last == NULL || depth_last->id != depth_data[i].id) {
+					depth_last = &depth_data[depth_data_len++];
+					*depth_last = depth_data[i];
+				}
+				else if (depth_last->depth > depth_data[i].depth) {
+					depth_last->depth = depth_data[i].depth;
+				}
+			}
+		}
+	}
+
+	/* Finally sort each unique (id, depth) pair by depth
+	 * so the final hit-list is sorted by depth (nearest first) */
+	unsigned int hits = 0;
+
+	if (depth_data_len > maxhits) {
+		hits = (unsigned int)-1;
+	}
+	else {
+		/* sort by depth here, so the caller receives the nearest hit first */
+		qsort(depth_data, depth_data_len, sizeof(DepthID), depth_cmp);
+
+		for (unsigned int i = 0; i < depth_data_len; i++) {
+#ifdef DEBUG_PRINT
+			printf("  hit: %u: depth %u\n", depth_data[i].id,  depth_data[i].depth);
+#endif
+			/* first 3 are dummy values */
+			g_pick_state.buffer[hits][0] = 1;
+			g_pick_state.buffer[hits][1] = 0x0;  /* depth_data[i].depth; */ /* unused */
+			g_pick_state.buffer[hits][2] = 0x0;  /* z-far is currently never used. */
+			g_pick_state.buffer[hits][3] = depth_data[i].id;
+			hits++;
+		}
+		BLI_assert(hits <= maxhits);  /* filling the buffer exactly is valid */
+	}
+
+	MEM_freeN(depth_data);
+
+	MEM_SAFE_FREE(ps->gl.rect_depth);
+	MEM_SAFE_FREE(ps->gl.rect_depth_test);
+
+	/* for GPU_SELECT_PICK_ALL, 'all.hits' was already freed as 'depth_data' */
+	if (g_pick_state.mode != GPU_SELECT_PICK_ALL) {
+		MEM_freeN(ps->nearest.rect_id);
+		ps->nearest.rect_id = NULL;
+	}
+
+	if (ps->use_cache) {
+		ps->is_cached = true;
+	}
+
+	return hits;
+}
+
+/* ----------------------------------------------------------------------------
+ * Caching
+ *
+ * Support multiple begin/end's reusing depth buffers.
+ */
+
+void gpu_select_pick_cache_begin(void)
+{
+	BLI_assert(g_pick_state.use_cache == false);
+#ifdef DEBUG_PRINT
+	printf("%s\n", __func__);
+#endif
+	g_pick_state.use_cache = true;
+	g_pick_state.is_cached = false;
+}
+
+void gpu_select_pick_cache_end(void)
+{
+#ifdef DEBUG_PRINT
+	printf("%s: with %d buffers\n", __func__, BLI_listbase_count(&g_pick_state.cache.bufs));
+#endif
+	g_pick_state.use_cache = false;
+	g_pick_state.is_cached = false;
+
+	BLI_freelistN(&g_pick_state.cache.bufs);
+}
+
+/* is drawing needed? */
+bool gpu_select_pick_is_cached(void)
+{
+	return g_pick_state.is_cached;
+}
+
+void gpu_select_pick_cache_load_id(void)
+{
+	BLI_assert(g_pick_state.is_cached == true);
+	GPUPickState *ps = &g_pick_state;
+#ifdef DEBUG_PRINT
+	printf("%s (building depth from cache)\n", __func__);
+#endif
+	for (DepthBufCache *rect_depth = ps->cache.bufs.first; rect_depth; rect_depth = rect_depth->next) {
+		if (rect_depth->next != NULL) {
+			/* we know the buffers differ, but this sub-region may not.
+			 * double check before adding an id-pass */
+			if (g_pick_state.mode == GPU_SELECT_PICK_ALL) {
+				if (depth_buf_subrect_depth_any(rect_depth->next, &ps->cache.sub_rect)) {
+					gpu_select_load_id_pass_all(rect_depth->next);
+				}
+			}
+			else {
+				if (depth_buf_subrect_depth_any_filled(rect_depth, rect_depth->next, &ps->cache.sub_rect)) {
+					gpu_select_load_id_pass_nearest(rect_depth, rect_depth->next);
+				}
+			}
+		}
+	}
+}
diff --git a/source/blender/gpu/intern/gpu_select_private.h b/source/blender/gpu/intern/gpu_select_private.h
new file mode 100644
index 00000000000..8935bd7b253
--- /dev/null
+++ b/source/blender/gpu/intern/gpu_select_private.h
@@ -0,0 +1,53 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2014 Blender Foundation.
+ * All rights reserved.
+ *
+ * Contributor(s): Antony Riakiotakis.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/** \file blender/gpu/intern/gpu_select_private.h
+ *  \ingroup gpu
+ *
+ * Selection implementations.
+ */
+
+#ifndef __GPU_SELECT_PRIVATE_H__
+#define __GPU_SELECT_PRIVATE_H__
+
+/* gpu_select_pick */
+void gpu_select_pick_begin(unsigned int (*buffer)[4], unsigned int bufsize, const rcti *input, char mode);
+bool gpu_select_pick_load_id(unsigned int id);
+unsigned int gpu_select_pick_end(void);
+
+void gpu_select_pick_cache_begin(void);
+void gpu_select_pick_cache_end(void);
+bool gpu_select_pick_is_cached(void);
+void gpu_select_pick_cache_load_id(void);
+
+/* gpu_select_sample_query */
+void gpu_select_query_begin(unsigned int (*buffer)[4], unsigned int bufsize, const rcti *input, char mode, int oldhits);
+bool gpu_select_query_load_id(unsigned int id);
+unsigned int gpu_select_query_end(void);
+
+
+#define SELECT_ID_NONE ((unsigned int)0xffffffff)
+
+#endif  /* __GPU_SELECT_PRIVATE_H__ */
diff --git a/source/blender/gpu/intern/gpu_select_sample_query.c b/source/blender/gpu/intern/gpu_select_sample_query.c
new file mode 100644
index 00000000000..ba5fefc5227
--- /dev/null
+++ b/source/blender/gpu/intern/gpu_select_sample_query.c
@@ -0,0 +1,209 @@
+/*
+ * ***** BEGIN GPL LICENSE BLOCK *****
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2014 Blender Foundation.
+ * All rights reserved.
+ *
+ * Contributor(s): Antony Riakiotakis.
+ *
+ * ***** END GPL LICENSE BLOCK *****
+ */
+
+/** \file blender/gpu/intern/gpu_select_sample_query.c
+ *  \ingroup gpu
+ *
+ * Interface for accessing gpu-related methods for selection. The semantics will be
+ * similar to glRenderMode(GL_SELECT) since the goal is to maintain compatibility.
+ */
+
+#include <stdlib.h>
+
+#include "GPU_select.h"
+#include "GPU_extensions.h"
+#include "GPU_glew.h"
+ 
+#include "MEM_guardedalloc.h"
+
+#include "BLI_rect.h"
+
+#include "BLI_utildefines.h"
+
+#include "gpu_select_private.h"
+
+
+/* Ad hoc number of queries to allocate to skip doing many glGenQueries */
+#define ALLOC_QUERIES 200
+
+typedef struct GPUQueryState {
+	/* Tracks whether a query has been issued so that gpu_load_id can end the previous one */
+	bool query_issued;
+	/* array holding the OpenGL query identifiers */
+	unsigned int *queries;
+	/* array holding the id corresponding to each query */
+	unsigned int *id;
+	/* number of queries in *queries and *id */
+	unsigned int num_of_queries;
+	/* index to the next query to start */
+	unsigned int active_query;
+	/* cache on initialization */
+	unsigned int (*buffer)[4];
+	/* buffer size (stores number of integers, for actual size multiply by sizeof integer)*/
+	unsigned int bufsize;
+	/* mode of operation */
+	char mode;
+	unsigned int index;
+	int oldhits;
+} GPUQueryState;
+
+static GPUQueryState g_query_state = {0};
+
+
+void gpu_select_query_begin(
+        unsigned int (*buffer)[4], unsigned int bufsize,
+        const rcti *input, char mode,
+        int oldhits)
+{
+	float viewport[4];
+
+	g_query_state.query_issued = false;
+	g_query_state.active_query = 0;
+	g_query_state.num_of_queries = 0;
+	g_query_state.bufsize = bufsize;
+	g_query_state.buffer = buffer;
+	g_query_state.mode = mode;
+	g_query_state.index = 0;
+	g_query_state.oldhits = oldhits;
+
+	g_query_state.num_of_queries = ALLOC_QUERIES;
+
+	g_query_state.queries = MEM_mallocN(g_query_state.num_of_queries * sizeof(*g_query_state.queries), "gpu selection queries");
+	g_query_state.id = MEM_mallocN(g_query_state.num_of_queries * sizeof(*g_query_state.id), "gpu selection ids");
+	glGenQueries(g_query_state.num_of_queries, g_query_state.queries);
+
+	glPushAttrib(GL_DEPTH_BUFFER_BIT | GL_VIEWPORT_BIT);
+	/* disable writing to the framebuffer */
+	glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+
+	/* In order to save some fill rate we minimize the viewport using rect.
+	 * We need to get the region of the scissor so that our geometry doesn't
+	 * get rejected before the depth test. Should probably cull rect against
+	 * scissor for viewport but this is a rare case I think */
+	glGetFloatv(GL_SCISSOR_BOX, viewport);
+	glViewport(viewport[0], viewport[1], BLI_rcti_size_x(input), BLI_rcti_size_y(input));
+
+	/* occlusion queries operates on fragments that pass tests and since we are interested on all
+	 * objects in the view frustum independently of their order, we need to disable the depth test */
+	if (mode == GPU_SELECT_ALL) {
+		glDisable(GL_DEPTH_TEST);
+		glDepthMask(GL_FALSE);
+	}
+	else if (mode == GPU_SELECT_NEAREST_FIRST_PASS) {
+		glClear(GL_DEPTH_BUFFER_BIT);
+		glEnable(GL_DEPTH_TEST);
+		glDepthMask(GL_TRUE);
+		glDepthFunc(GL_LEQUAL);
+	}
+	else if (mode == GPU_SELECT_NEAREST_SECOND_PASS) {
+		glEnable(GL_DEPTH_TEST);
+		glDepthMask(GL_FALSE);
+		glDepthFunc(GL_EQUAL);
+	}
+}
+
+/* Start the query for 'id' (ending any previous one), false if a second-pass 'id' isn't the expected hit. */
+bool gpu_select_query_load_id(unsigned int id)
+{
+	if (g_query_state.query_issued) {
+		glEndQuery(GL_SAMPLES_PASSED);
+	}
+	/* if required, allocate extra queries */
+	if (g_query_state.active_query == g_query_state.num_of_queries) {
+		g_query_state.num_of_queries += ALLOC_QUERIES;
+		g_query_state.queries = MEM_reallocN(g_query_state.queries, g_query_state.num_of_queries * sizeof(*g_query_state.queries));
+		g_query_state.id = MEM_reallocN(g_query_state.id, g_query_state.num_of_queries * sizeof(*g_query_state.id));
+		glGenQueries(ALLOC_QUERIES, &g_query_state.queries[g_query_state.active_query]);
+	}
+
+	glBeginQuery(GL_SAMPLES_PASSED, g_query_state.queries[g_query_state.active_query]);
+	g_query_state.id[g_query_state.active_query] = id;
+	g_query_state.active_query++;
+	g_query_state.query_issued = true;
+
+	/* 'oldhits' is signed: check it first, a negative value would convert to a huge unsigned limit */
+	if (g_query_state.mode == GPU_SELECT_NEAREST_SECOND_PASS && g_query_state.oldhits > 0 &&
+	    g_query_state.index < (unsigned int)g_query_state.oldhits) {
+		if (g_query_state.buffer[g_query_state.index][3] == id) {
+			g_query_state.index++;
+			return true;
+		}
+		return false;
+	}
+	return true;
+}
+
+unsigned int gpu_select_query_end(void)
+{
+	int i;
+
+	unsigned int hits = 0;
+	const unsigned int maxhits = g_query_state.bufsize;
+
+	if (g_query_state.query_issued) {
+		glEndQuery(GL_SAMPLES_PASSED);
+	}
+
+	for (i = 0; i < g_query_state.active_query; i++) {
+		unsigned int result;
+		glGetQueryObjectuiv(g_query_state.queries[i], GL_QUERY_RESULT, &result);
+		if (result > 0) {
+			if (g_query_state.mode != GPU_SELECT_NEAREST_SECOND_PASS) {
+
+				if (hits < maxhits) {
+					g_query_state.buffer[hits][0] = 1;
+					g_query_state.buffer[hits][1] = 0xFFFF;
+					g_query_state.buffer[hits][2] = 0xFFFF;
+					g_query_state.buffer[hits][3] = g_query_state.id[i];
+
+					hits++;
+				}
+				else {
+					hits = -1;
+					break;
+				}
+			}
+			else {
+				int j;
+				/* search in buffer and make selected object first */
+				for (j = 0; j < g_query_state.oldhits; j++) {
+					if (g_query_state.buffer[j][3] == g_query_state.id[i]) {
+						g_query_state.buffer[j][1] = 0;
+						g_query_state.buffer[j][2] = 0;
+					}
+				}
+				break;
+			}
+		}
+	}
+
+	glDeleteQueries(g_query_state.num_of_queries, g_query_state.queries);
+	MEM_freeN(g_query_state.queries);
+	MEM_freeN(g_query_state.id);
+	glPopAttrib();
+	glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+
+	return hits;
+}
diff --git a/source/blender/imbuf/IMB_colormanagement.h b/source/blender/imbuf/IMB_colormanagement.h
index 52febe642a0..4164f5fa75d 100644
--- a/source/blender/imbuf/IMB_colormanagement.h
+++ b/source/blender/imbuf/IMB_colormanagement.h
@@ -77,6 +77,16 @@ void IMB_colormanagement_transform(float *buffer, int width, int height, int cha
                                    const char *from_colorspace, const char *to_colorspace, bool predivide);
 void IMB_colormanagement_transform_threaded(float *buffer, int width, int height, int channels,
                                             const char *from_colorspace, const char *to_colorspace, bool predivide);
+void IMB_colormanagement_transform_byte(unsigned char *buffer, int width, int height, int channels,
+                                        const char *from_colorspace, const char *to_colorspace);
+void IMB_colormanagement_transform_byte_threaded(unsigned char *buffer, int width, int height, int channels,
+                                                 const char *from_colorspace, const char *to_colorspace);
+void IMB_colormanagement_transform_from_byte(float *float_buffer, unsigned char *byte_buffer,
+                                             int width, int height, int channels,
+                                             const char *from_colorspace, const char *to_colorspace);
+void IMB_colormanagement_transform_from_byte_threaded(float *float_buffer, unsigned char *byte_buffer,
+                                                      int width, int height, int channels,
+                                                      const char *from_colorspace, const char *to_colorspace);
 void IMB_colormanagement_transform_v4(float pixel[4], const char *from_colorspace, const char *to_colorspace);
 
 void IMB_colormanagement_colorspace_to_scene_linear_v3(float pixel[3], struct ColorSpace *colorspace);
@@ -185,6 +195,8 @@ void IMB_colormanagement_processor_apply_v3(struct ColormanageProcessor *cm_proc
 void IMB_colormanagement_processor_apply_pixel(struct ColormanageProcessor *cm_processor, float *pixel, int channels);
 void IMB_colormanagement_processor_apply(struct ColormanageProcessor *cm_processor, float *buffer, int width, int height,
                                          int channels, bool predivide);
+void IMB_colormanagement_processor_apply_byte(struct ColormanageProcessor *cm_processor,
+                                              unsigned char *buffer, int width, int height, int channels);
 void IMB_colormanagement_processor_free(struct ColormanageProcessor *cm_processor);
 
 /* ** OpenGL drawing routines using GLSL for color space transform ** */
diff --git a/source/blender/imbuf/intern/colormanagement.c b/source/blender/imbuf/intern/colormanagement.c
index a45346279d9..48cba3e0800 100644
--- a/source/blender/imbuf/intern/colormanagement.c
+++ b/source/blender/imbuf/intern/colormanagement.c
@@ -1555,21 +1555,25 @@ static void colormanage_display_buffer_process(ImBuf *ibuf, unsigned char *displ
 
 typedef struct ProcessorTransformThread {
 	ColormanageProcessor *cm_processor;
-	float *buffer;
+	unsigned char *byte_buffer;
+	float *float_buffer;
 	int width;
 	int start_line;
 	int tot_line;
 	int channels;
 	bool predivide;
+	bool float_from_byte;
 } ProcessorTransformThread;
 
 typedef struct ProcessorTransformInit {
 	ColormanageProcessor *cm_processor;
-	float *buffer;
+	unsigned char *byte_buffer;
+	float *float_buffer;
 	int width;
 	int height;
 	int channels;
 	bool predivide;
+	bool float_from_byte;
 } ProcessorTransformInitData;
 
 static void processor_transform_init_handle(void *handle_v, int start_line, int tot_line, void *init_data_v)
@@ -1577,17 +1581,24 @@ static void processor_transform_init_handle(void *handle_v, int start_line, int
 	ProcessorTransformThread *handle = (ProcessorTransformThread *) handle_v;
 	ProcessorTransformInitData *init_data = (ProcessorTransformInitData *) init_data_v;
 
-	int channels = init_data->channels;
-	int width = init_data->width;
-	bool predivide = init_data->predivide;
+	const int channels = init_data->channels;
+	const int width = init_data->width;
+	const bool predivide = init_data->predivide;
+	const bool float_from_byte = init_data->float_from_byte;
 
-	size_t offset = ((size_t)channels) * start_line * width;
+	const size_t offset = ((size_t)channels) * start_line * width;
 
 	memset(handle, 0, sizeof(ProcessorTransformThread));
 
 	handle->cm_processor = init_data->cm_processor;
 
-	handle->buffer = init_data->buffer + offset;
+	if (init_data->byte_buffer != NULL) {
+		/* TODO(serge): Offset might be different for byte and float buffers. */
+		handle->byte_buffer = init_data->byte_buffer + offset;
+	}
+	if (init_data->float_buffer != NULL) {
+		handle->float_buffer = init_data->float_buffer + offset;
+	}
 
 	handle->width = width;
 
@@ -1596,33 +1607,62 @@ static void processor_transform_init_handle(void *handle_v, int start_line, int
 
 	handle->channels = channels;
 	handle->predivide = predivide;
+	handle->float_from_byte = float_from_byte;
 }
 
 static void *do_processor_transform_thread(void *handle_v)
 {
 	ProcessorTransformThread *handle = (ProcessorTransformThread *) handle_v;
-	float *buffer = handle->buffer;
-	int channels = handle->channels;
-	int width = handle->width;
-	int height = handle->tot_line;
-	bool predivide = handle->predivide;
-
-	IMB_colormanagement_processor_apply(handle->cm_processor, buffer, width, height, channels, predivide);
+	unsigned char *byte_buffer = handle->byte_buffer;
+	float *float_buffer = handle->float_buffer;
+	const int channels = handle->channels;
+	const int width = handle->width;
+	const int height = handle->tot_line;
+	const bool predivide = handle->predivide;
+	const bool float_from_byte = handle->float_from_byte;
+	/* Worker body: only the tot_line lines described by this handle are processed. */
+	if (float_from_byte) {
+		/* Fill the float buffer from the bytes first, then transform the floats in place. */
+		IMB_buffer_float_from_byte(float_buffer, byte_buffer,
+		                           IB_PROFILE_SRGB, IB_PROFILE_SRGB, true,
+		                           width, height, width, width);
+		IMB_colormanagement_processor_apply(handle->cm_processor,
+		                                    float_buffer,
+		                                    width, height, channels,
+		                                    predivide);
+	}
+	else {  /* transform whichever of the two buffers was supplied, each in place */
+		if (byte_buffer != NULL) {
+			IMB_colormanagement_processor_apply_byte(handle->cm_processor,
+			                                         byte_buffer,
+			                                         width, height, channels);
+		}
+		if (float_buffer != NULL) {
+			IMB_colormanagement_processor_apply(handle->cm_processor,
+			                                    float_buffer,
+			                                    width, height, channels,
+			                                    predivide);
+		}
+	}
 
 	return NULL;
 }
 
-static void processor_transform_apply_threaded(float *buffer, int width, int height, int channels,
-                                               ColormanageProcessor *cm_processor, bool predivide)
+static void processor_transform_apply_threaded(unsigned char *byte_buffer, float *float_buffer,
+                                               const int width, const int height, const int channels,
+                                               ColormanageProcessor *cm_processor,
+                                               const bool predivide, const bool float_from_byte)
 {
 	ProcessorTransformInitData init_data;
 
 	init_data.cm_processor = cm_processor;
-	init_data.buffer = buffer;
+	init_data.byte_buffer = byte_buffer;
+	init_data.float_buffer = float_buffer;
 	init_data.width = width;
 	init_data.height = height;
 	init_data.channels = channels;
 	init_data.predivide = predivide;
+	init_data.float_from_byte = float_from_byte;
 
 	IMB_processor_apply_threaded(height, sizeof(ProcessorTransformThread), &init_data,
 	                             processor_transform_init_handle, do_processor_transform_thread);
@@ -1631,8 +1671,10 @@ static void processor_transform_apply_threaded(float *buffer, int width, int hei
 /*********************** Color space transformation functions *************************/
 
 /* convert the whole buffer from specified by name color space to another - internal implementation */
-static void colormanagement_transform_ex(float *buffer, int width, int height, int channels, const char *from_colorspace,
-                                         const char *to_colorspace, bool predivide, bool do_threaded)
+static void colormanagement_transform_ex(unsigned char *byte_buffer, float *float_buffer,
+                                         int width, int height, int channels,
+                                         const char *from_colorspace, const char *to_colorspace,
+                                         bool predivide, bool do_threaded)
 {
 	ColormanageProcessor *cm_processor;
 
@@ -1649,10 +1691,19 @@ static void colormanagement_transform_ex(float *buffer, int width, int height, i
 
 	cm_processor = IMB_colormanagement_colorspace_processor_new(from_colorspace, to_colorspace);
 
-	if (do_threaded)
-		processor_transform_apply_threaded(buffer, width, height, channels, cm_processor, predivide);
-	else
-		IMB_colormanagement_processor_apply(cm_processor, buffer, width, height, channels, predivide);
+	if (do_threaded) {
+		processor_transform_apply_threaded(byte_buffer, float_buffer,
+		                                   width, height, channels,
+		                                   cm_processor, predivide, false);
+	}
+	else {
+		if (byte_buffer != NULL) {
+			IMB_colormanagement_processor_apply_byte(cm_processor, byte_buffer, width, height, channels);
+		}
+		if (float_buffer != NULL) {
+			IMB_colormanagement_processor_apply(cm_processor, float_buffer, width, height, channels, predivide);
+		}
+	}
 
 	IMB_colormanagement_processor_free(cm_processor);
 }
@@ -1661,7 +1712,7 @@ static void colormanagement_transform_ex(float *buffer, int width, int height, i
 void IMB_colormanagement_transform(float *buffer, int width, int height, int channels,
                                    const char *from_colorspace, const char *to_colorspace, bool predivide)
 {
-	colormanagement_transform_ex(buffer, width, height, channels, from_colorspace, to_colorspace, predivide, false);
+	colormanagement_transform_ex(NULL, buffer, width, height, channels, from_colorspace, to_colorspace, predivide, false);
 }
 
 /* convert the whole buffer from specified by name color space to another
@@ -1670,7 +1721,54 @@ void IMB_colormanagement_transform(float *buffer, int width, int height, int cha
 void IMB_colormanagement_transform_threaded(float *buffer, int width, int height, int channels,
                                             const char *from_colorspace, const char *to_colorspace, bool predivide)
 {
-	colormanagement_transform_ex(buffer, width, height, channels, from_colorspace, to_colorspace, predivide, true);
+	colormanagement_transform_ex(NULL, buffer, width, height, channels, from_colorspace, to_colorspace, predivide, true);
+}
+
+/* Byte-buffer variants of the functions above (NULL float buffer, predivide off); the second one is threaded. */
+void IMB_colormanagement_transform_byte(unsigned char *buffer, int width, int height, int channels,
+                                        const char *from_colorspace, const char *to_colorspace)
+{
+	colormanagement_transform_ex(buffer, NULL, width, height, channels, from_colorspace, to_colorspace, false, false);
+}
+void IMB_colormanagement_transform_byte_threaded(unsigned char *buffer, int width, int height, int channels,
+                                                 const char *from_colorspace, const char *to_colorspace)
+{
+	colormanagement_transform_ex(buffer, NULL, width, height, channels, from_colorspace, to_colorspace, false, true);
+}
+
+/* Fills float_buffer from byte_buffer, then transforms float_buffer between the color spaces (predivide on). */
+void IMB_colormanagement_transform_from_byte(float *float_buffer, unsigned char *byte_buffer,
+                                             int width, int height, int channels,
+                                             const char *from_colorspace, const char *to_colorspace)
+{
+	IMB_buffer_float_from_byte(float_buffer, byte_buffer,
+	                           IB_PROFILE_SRGB, IB_PROFILE_SRGB,
+	                           true,
+	                           width, height, width, width);
+	IMB_colormanagement_transform(float_buffer,
+	                              width, height, channels,
+	                              from_colorspace, to_colorspace,
+	                              true);
+}
+/* Threaded variant of the function above; does nothing when from_colorspace is NULL or empty. */
+void IMB_colormanagement_transform_from_byte_threaded(float *float_buffer, unsigned char *byte_buffer,
+                                                      int width, int height, int channels,
+                                                      const char *from_colorspace, const char *to_colorspace)
+{
+	ColormanageProcessor *cm_processor;
+	if (from_colorspace == NULL || from_colorspace[0] == '\0') {
+		return;
+	}
+	if (STREQ(from_colorspace, to_colorspace)) {
+		/* No transform needed, but float_buffer must still be filled; skip the threading overhead. */
+		IMB_buffer_float_from_byte(float_buffer, byte_buffer, IB_PROFILE_SRGB, IB_PROFILE_SRGB,
+		                           true, width, height, width, width);
+		return;
+	}
+	cm_processor = IMB_colormanagement_colorspace_processor_new(from_colorspace, to_colorspace);
+	processor_transform_apply_threaded(byte_buffer, float_buffer, width, height, channels,
+	                                   cm_processor, true, true);
+	IMB_colormanagement_processor_free(cm_processor);
 }
 
 void IMB_colormanagement_transform_v4(float pixel[4], const char *from_colorspace, const char *to_colorspace)
@@ -1974,12 +2072,14 @@ void IMB_colormanagement_buffer_make_display_space(float *buffer, unsigned char
 	size_t float_buffer_size = ((size_t)width) * height * channels * sizeof(float);
 	float *display_buffer_float = MEM_mallocN(float_buffer_size, "byte_buffer_make_display_space");
 
+	/* TODO(sergey): Convert float directly to byte buffer. */
+
 	memcpy(display_buffer_float, buffer, float_buffer_size);
 
 	cm_processor = IMB_colormanagement_display_processor_new(view_settings, display_settings);
 
-	processor_transform_apply_threaded(display_buffer_float, width, height, channels,
-	                                   cm_processor, true);
+	processor_transform_apply_threaded(NULL, display_buffer_float, width, height, channels,
+	                                   cm_processor, true, false);
 
 	IMB_buffer_byte_from_float(display_buffer, display_buffer_float,
 	                           channels, dither, IB_PROFILE_SRGB, IB_PROFILE_SRGB,
@@ -3100,6 +3200,25 @@ void IMB_colormanagement_processor_apply(ColormanageProcessor *cm_processor, flo
 	}
 }
 
+/* Apply the processor in place to a byte buffer, one pixel at a time through a float[4] copy. */
+void IMB_colormanagement_processor_apply_byte(ColormanageProcessor *cm_processor,
+                                              unsigned char *buffer,
+                                              int width, int height, int channels)
+{
+	/* TODO(sergey): Support arbitrary channel configurations; not so important for now. */
+	BLI_assert(channels == 4);
+	float rgba[4];
+	for (int row = 0; row < height; row++) {
+		const size_t row_start = ((size_t)row) * width;
+		for (int col = 0; col < width; col++) {
+			unsigned char *texel = buffer + channels * (row_start + col);
+			rgba_uchar_to_float(rgba, texel);
+			IMB_colormanagement_processor_apply_v4(cm_processor, rgba);
+			rgba_float_to_uchar(texel, rgba);
+		}
+	}
+}
+
 void IMB_colormanagement_processor_free(ColormanageProcessor *cm_processor)
 {
 	if (cm_processor->curve_mapping)
diff --git a/source/blender/imbuf/intern/imbuf.h b/source/blender/imbuf/intern/imbuf.h
index 897a149a45c..90dad70fa61 100644
--- a/source/blender/imbuf/intern/imbuf.h
+++ b/source/blender/imbuf/intern/imbuf.h
@@ -67,8 +67,6 @@
 #  define BIG_LONG SWAP_LONG
 #endif
 
-typedef unsigned char uchar;
-
 #define IMB_DPI_DEFAULT 72.0f
 
 #endif	/* __IMBUF_H__ */
diff --git a/source/blender/makesdna/DNA_modifier_types.h b/source/blender/makesdna/DNA_modifier_types.h
index f95533a88f9..823a7f0812f 100644
--- a/source/blender/makesdna/DNA_modifier_types.h
+++ b/source/blender/makesdna/DNA_modifier_types.h
@@ -86,6 +86,7 @@ typedef enum ModifierType {
 	eModifierType_NormalEdit        = 50,
 	eModifierType_CorrectiveSmooth  = 51,
 	eModifierType_MeshSequenceCache = 52,
+	eModifierType_SurfaceDeform     = 53,
 	NUM_MODIFIER_TYPES
 } ModifierType;
 
@@ -276,6 +277,7 @@ typedef struct MirrorModifierData {
 	short axis  DNA_DEPRECATED; /* deprecated, use flag instead */
 	short flag;
 	float tolerance;
+	float uv_offset[2];
 	struct Object *mirror_ob;
 } MirrorModifierData;
 
@@ -1570,6 +1572,46 @@ enum {
 	MOD_MESHSEQ_READ_COLOR = (1 << 3),
 };
 
+typedef struct SDefBind {
+	unsigned int *vert_inds;	/* presumably indices of the target vertices used by this bind -- confirm */
+	unsigned int numverts;	/* presumably the length of vert_inds / vert_weights -- confirm */
+	int mode;	/* presumably one of the MOD_SDEF_MODE_* vertex bind modes -- confirm */
+	float *vert_weights;
+	float normal_dist;
+	float influence;
+} SDefBind;
+
+typedef struct SDefVert {
+	SDefBind *binds;	/* bind entries of this vertex; presumably numbinds of them -- confirm */
+	unsigned int numbinds;
+	char pad[4];	/* explicit padding after the 4-byte counter */
+} SDefVert;
+
+typedef struct SurfaceDeformModifierData {
+	ModifierData modifier;
+
+	struct Object *target;	/* bind target object */
+	SDefVert *verts;		/* vertex bind data, the RNA "is_bound" getter tests it against NULL */
+	float falloff;	/* interpolation falloff; the RNA property limits it to 2..16 */
+	unsigned int numverts, numpoly;	/* presumably counts captured at bind time -- confirm */
+	int flags;	/* MOD_SDEF_* flags */
+	float mat[4][4];	/* NOTE(review): purpose of this matrix is not visible here -- confirm */
+} SurfaceDeformModifierData;
+
+/* Surface Deform modifier flags */
+enum {
+	MOD_SDEF_BIND = (1 << 0),
+	MOD_SDEF_USES_LOOPTRI = (1 << 1),
+	MOD_SDEF_HAS_CONCAVE = (1 << 2),
+};
+
+/* Surface Deform vertex bind modes */
+enum {
+	MOD_SDEF_MODE_LOOPTRI = 0,
+	MOD_SDEF_MODE_NGON = 1,
+	MOD_SDEF_MODE_CENTROID = 2,
+};
+
 #define MOD_MESHSEQ_READ_ALL \
 	(MOD_MESHSEQ_READ_VERT | MOD_MESHSEQ_READ_POLY | MOD_MESHSEQ_READ_UV | MOD_MESHSEQ_READ_COLOR)
 
diff --git a/source/blender/makesdna/DNA_node_types.h b/source/blender/makesdna/DNA_node_types.h
index fd601e55550..47677e50451 100644
--- a/source/blender/makesdna/DNA_node_types.h
+++ b/source/blender/makesdna/DNA_node_types.h
@@ -668,7 +668,8 @@ typedef struct NodeScriptDict {
 /* qdn: glare node */
 typedef struct NodeGlare {
 	char quality, type, iter;
-	char angle, pad_c1, size, pad[2];
+	/* XXX angle is only kept for backward/forward compatibility, was used for two different things, see T50736. */
+	char angle DNA_DEPRECATED, pad_c1, size, star_45, streaks;
 	float colmod, mix, threshold, fade;
 	float angle_ofs, pad_f1;
 } NodeGlare;
diff --git a/source/blender/makesdna/DNA_object_force.h b/source/blender/makesdna/DNA_object_force.h
index 59acefeffe4..ed14c4b9311 100644
--- a/source/blender/makesdna/DNA_object_force.h
+++ b/source/blender/makesdna/DNA_object_force.h
@@ -372,6 +372,7 @@ typedef struct SoftBody {
 #define PFIELD_DO_ROTATION		(1<<15)
 #define PFIELD_GUIDE_PATH_WEIGHT (1<<16)	/* apply curve weights */
 #define PFIELD_SMOKE_DENSITY    (1<<17)		/* multiply smoke force by density */
+#define PFIELD_GRAVITATION		(1<<18)             /* used for (simple) force */
 
 /* pd->falloff */
 #define PFIELD_FALL_SPHERE		0
diff --git a/source/blender/makesdna/DNA_scene_types.h b/source/blender/makesdna/DNA_scene_types.h
index 8ee15ef21a3..918d0f00040 100644
--- a/source/blender/makesdna/DNA_scene_types.h
+++ b/source/blender/makesdna/DNA_scene_types.h
@@ -1716,6 +1716,7 @@ typedef struct Scene {
 #define SCER_LOCK_FRAME_SELECTION	(1<<1)
 	/* timeline/keyframe jumping - only selected items (on by default) */
 #define SCE_KEYS_NO_SELONLY	(1<<2)
+#define SCER_SHOW_SUBFRAME	(1<<3)
 
 /* mode (int now) */
 #define R_OSA			0x0001
diff --git a/source/blender/makesdna/DNA_userdef_types.h b/source/blender/makesdna/DNA_userdef_types.h
index 0ad4482708f..5e7e7366e35 100644
--- a/source/blender/makesdna/DNA_userdef_types.h
+++ b/source/blender/makesdna/DNA_userdef_types.h
@@ -467,13 +467,18 @@ typedef struct UserDef {
 	int audioformat;
 	int audiochannels;
 
-	int scrollback; /* console scrollback limit */
-	int dpi;		/* range 48-128? */
-	char node_margin; /* node insert offset (aka auto-offset) margin, but might be useful for later stuff as well */
+	int scrollback;     /* console scrollback limit */
+	int dpi;            /* range 48-128? */
+	float ui_scale;     /* interface scale */
+	int pad1;
+	char node_margin;   /* node insert offset (aka auto-offset) margin, but might be useful for later stuff as well */
 	char pad2;
 	short transopts;
 	short menuthreshold1, menuthreshold2;
-	
+
+	/* startup template */
+	char app_template[64];
+
 	struct ListBase themes;
 	struct ListBase uifonts;
 	struct ListBase uistyles;
@@ -497,7 +502,6 @@ typedef struct UserDef {
 	int prefetchframes;
 	float pad_rot_angle; /* control the rotation step of the view when PAD2, PAD4, PAD6&PAD8 is use */
 	short frameserverport;
-	short pad4;
 	short obcenter_dia;
 	short rvisize;			/* rotating view icon size */
 	short rvibright;		/* rotating view icon brightness */
@@ -509,6 +513,8 @@ typedef struct UserDef {
 	char  ipo_new;			/* interpolation mode for newly added F-Curves */
 	char  keyhandles_new;	/* handle types for newly added keyframes */
 	char  gpu_select_method;
+	char  gpu_select_pick_deph;
+	char  pad4;
 	char  view_frame_type;
 
 	int view_frame_keyframes; /* number of keyframes to zoom around current frame */
diff --git a/source/blender/makesrna/RNA_access.h b/source/blender/makesrna/RNA_access.h
index 66e6f30feeb..f9aaec69ce7 100644
--- a/source/blender/makesrna/RNA_access.h
+++ b/source/blender/makesrna/RNA_access.h
@@ -598,6 +598,7 @@ extern StructRNA RNA_StucciTexture;
 extern StructRNA RNA_SubsurfModifier;
 extern StructRNA RNA_SunLamp;
 extern StructRNA RNA_SurfaceCurve;
+extern StructRNA RNA_SurfaceDeformModifier;
 extern StructRNA RNA_SurfaceModifier;
 extern StructRNA RNA_TexMapping;
 extern StructRNA RNA_Text;
diff --git a/source/blender/makesrna/intern/rna_action.c b/source/blender/makesrna/intern/rna_action.c
index 0c4c7ddac81..d398ce95a52 100644
--- a/source/blender/makesrna/intern/rna_action.c
+++ b/source/blender/makesrna/intern/rna_action.c
@@ -34,6 +34,8 @@
 
 #include "BLI_utildefines.h"
 
+#include "BLT_translation.h"
+
 #include "BKE_action.h"
 
 #include "RNA_access.h"
@@ -731,6 +733,7 @@ static void rna_def_action(BlenderRNA *brna)
 	RNA_def_property_ui_text(prop, "ID Root Type",
 	                         "Type of ID block that action can be used on - "
 	                         "DO NOT CHANGE UNLESS YOU KNOW WHAT YOU ARE DOING");
+	RNA_def_property_translation_context(prop, BLT_I18NCONTEXT_ID_ID);
 	
 	/* API calls */
 	RNA_api_action(srna);
diff --git a/source/blender/makesrna/intern/rna_animation.c b/source/blender/makesrna/intern/rna_animation.c
index 9adbf5f6b2e..f271bccd326 100644
--- a/source/blender/makesrna/intern/rna_animation.c
+++ b/source/blender/makesrna/intern/rna_animation.c
@@ -32,6 +32,8 @@
 
 #include "BLI_utildefines.h"
 
+#include "BLT_translation.h"
+
 #include "MEM_guardedalloc.h"
 
 #include "RNA_access.h"
@@ -742,6 +744,7 @@ static void rna_def_keyingset_path(BlenderRNA *brna)
 	RNA_def_property_enum_default(prop, ID_OB);
 	RNA_def_property_enum_funcs(prop, NULL, "rna_ksPath_id_type_set", NULL);
 	RNA_def_property_ui_text(prop, "ID Type", "Type of ID-block that can be used");
+	RNA_def_property_translation_context(prop, BLT_I18NCONTEXT_ID_ID);
 	RNA_def_property_update(prop, NC_SCENE | ND_KEYINGSET | NA_EDITED, NULL); /* XXX: maybe a bit too noisy */
 	
 	/* Group */
diff --git a/source/blender/makesrna/intern/rna_armature.c b/source/blender/makesrna/intern/rna_armature.c
index 07d295c8bbc..891f5c43ca6 100644
--- a/source/blender/makesrna/intern/rna_armature.c
+++ b/source/blender/makesrna/intern/rna_armature.c
@@ -187,6 +187,9 @@ static void rna_Bone_select_update(Main *UNUSED(bmain), Scene *UNUSED(scene), Po
 	}
 	
 	WM_main_add_notifier(NC_GEOM | ND_DATA, id);
+
+	/* spaces that show animation data of the selected bone need updating */
+	WM_main_add_notifier(NC_ANIMATION | ND_ANIMCHAN, id);
 }
 
 static char *rna_Bone_path(PointerRNA *ptr)
diff --git a/source/blender/makesrna/intern/rna_define.c b/source/blender/makesrna/intern/rna_define.c
index dc97d39052b..1d232d2df39 100644
--- a/source/blender/makesrna/intern/rna_define.c
+++ b/source/blender/makesrna/intern/rna_define.c
@@ -3157,8 +3157,9 @@ int rna_parameter_size(PropertyRNA *parm)
 					StringPropertyRNA *sparm = (StringPropertyRNA *)parm;
 					return sizeof(char) * sparm->maxlength;
 				}
-				else
+				else {
 					return sizeof(char *);
+				}
 			case PROP_POINTER:
 			{
 #ifdef RNA_RUNTIME
diff --git a/source/blender/makesrna/intern/rna_fcurve.c b/source/blender/makesrna/intern/rna_fcurve.c
index 4acdee490b8..bccc47aa95d 100644
--- a/source/blender/makesrna/intern/rna_fcurve.c
+++ b/source/blender/makesrna/intern/rna_fcurve.c
@@ -35,6 +35,8 @@
 
 #include "BLI_math.h"
 
+#include "BLT_translation.h"
+
 #include "BKE_action.h"
 
 #include "RNA_access.h"
@@ -1448,6 +1450,7 @@ static void rna_def_drivertarget(BlenderRNA *brna)
 	RNA_def_property_enum_funcs(prop, NULL, "rna_DriverTarget_id_type_set", NULL);
 	RNA_def_property_editable_func(prop, "rna_DriverTarget_id_type_editable");
 	RNA_def_property_ui_text(prop, "ID Type", "Type of ID-block that can be used");
+	RNA_def_property_translation_context(prop, BLT_I18NCONTEXT_ID_ID);
 	RNA_def_property_update(prop, 0, "rna_DriverTarget_update_data");
 	
 	/* Target Properties - Property to Drive */
diff --git a/source/blender/makesrna/intern/rna_mesh_api.c b/source/blender/makesrna/intern/rna_mesh_api.c
index ff9873fb3d1..9b0a25560f9 100644
--- a/source/blender/makesrna/intern/rna_mesh_api.c
+++ b/source/blender/makesrna/intern/rna_mesh_api.c
@@ -209,6 +209,11 @@ static void rna_Mesh_flip_normals(Mesh *mesh)
 	DAG_id_tag_update(&mesh->id, 0);
 }
 
+static void rna_Mesh_split_faces(Mesh *mesh, int free_loop_normals)
+{
+	BKE_mesh_split_faces(mesh, free_loop_normals != 0);  /* RNA boolean arrives as int; pass a bool */
+}
+
 #else
 
 void RNA_api_mesh(StructRNA *srna)
@@ -240,8 +245,10 @@ void RNA_api_mesh(StructRNA *srna)
 	func = RNA_def_function(srna, "free_normals_split", "rna_Mesh_free_normals_split");
 	RNA_def_function_ui_description(func, "Free split vertex normals");
 
-	func = RNA_def_function(srna, "split_faces", "BKE_mesh_split_faces");
+	func = RNA_def_function(srna, "split_faces", "rna_Mesh_split_faces");
 	RNA_def_function_ui_description(func, "Split faces based on the edge angle");
+	RNA_def_boolean(func, "free_loop_normals", 1, "Free Loop Normals",
+	                "Free loop normals custom data layer");
 
 	func = RNA_def_function(srna, "calc_tangents", "rna_Mesh_calc_tangents");
 	RNA_def_function_flag(func, FUNC_USE_REPORTS);
diff --git a/source/blender/makesrna/intern/rna_modifier.c b/source/blender/makesrna/intern/rna_modifier.c
index c4f0db38a16..1a8dd05a7b5 100644
--- a/source/blender/makesrna/intern/rna_modifier.c
+++ b/source/blender/makesrna/intern/rna_modifier.c
@@ -105,6 +105,7 @@ EnumPropertyItem rna_enum_object_modifier_type_items[] = {
 	{eModifierType_Shrinkwrap, "SHRINKWRAP", ICON_MOD_SHRINKWRAP, "Shrinkwrap", ""},
 	{eModifierType_SimpleDeform, "SIMPLE_DEFORM", ICON_MOD_SIMPLEDEFORM, "Simple Deform", ""},
 	{eModifierType_Smooth, "SMOOTH", ICON_MOD_SMOOTH, "Smooth", ""},
+	{eModifierType_SurfaceDeform, "SURFACE_DEFORM", ICON_MOD_MESHDEFORM, "Surface Deform", ""},
 	{eModifierType_Warp, "WARP", ICON_MOD_WARP, "Warp", ""},
 	{eModifierType_Wave, "WAVE", ICON_MOD_WAVE, "Wave", ""},
 	{0, "", 0, N_("Simulate"), ""},
@@ -408,6 +409,8 @@ static StructRNA *rna_Modifier_refine(struct PointerRNA *ptr)
 			return &RNA_CorrectiveSmoothModifier;
 		case eModifierType_MeshSequenceCache:
 			return &RNA_MeshSequenceCacheModifier;
+		case eModifierType_SurfaceDeform:
+			return &RNA_SurfaceDeformModifier;
 		/* Default */
 		case eModifierType_None:
 		case eModifierType_ShapeKey:
@@ -573,6 +576,7 @@ RNA_MOD_OBJECT_SET(MeshDeform, object, OB_MESH);
 RNA_MOD_OBJECT_SET(NormalEdit, target, OB_EMPTY);
 RNA_MOD_OBJECT_SET(Shrinkwrap, target, OB_MESH);
 RNA_MOD_OBJECT_SET(Shrinkwrap, auxTarget, OB_MESH);
+RNA_MOD_OBJECT_SET(SurfaceDeform, target, OB_MESH);
 
 static void rna_HookModifier_object_set(PointerRNA *ptr, PointerRNA value)
 {
@@ -1131,6 +1135,11 @@ static int rna_CorrectiveSmoothModifier_is_bind_get(PointerRNA *ptr)
 	return (csmd->bind_coords != NULL);
 }
 
+static int rna_SurfaceDeformModifier_is_bound_get(PointerRNA *ptr)
+{
+	return (((SurfaceDeformModifierData *)ptr->data)->verts != NULL);  /* bound when bind data exists */
+}
+
 static void rna_MeshSequenceCache_object_path_update(Main *bmain, Scene *scene, PointerRNA *ptr)
 {
 #ifdef WITH_ALEMBIC
@@ -1517,6 +1526,20 @@ static void rna_def_modifier_mirror(BlenderRNA *brna)
 	RNA_def_property_ui_text(prop, "Mirror V", "Mirror the V texture coordinate around the 0.5 point");
 	RNA_def_property_update(prop, 0, "rna_Modifier_update");
 
+	prop = RNA_def_property(srna, "mirror_offset_u", PROP_FLOAT, PROP_FACTOR);
+	RNA_def_property_float_sdna(prop, NULL, "uv_offset[0]");
+	RNA_def_property_range(prop, -1, 1);
+	RNA_def_property_ui_range(prop, -1, 1, 2, 4);
+	RNA_def_property_ui_text(prop, "U Offset", "Amount to offset mirrored UVs from the 0.5 point on the U axis");
+	RNA_def_property_update(prop, 0, "rna_Modifier_update");
+
+	prop = RNA_def_property(srna, "mirror_offset_v", PROP_FLOAT, PROP_FACTOR);
+	RNA_def_property_float_sdna(prop, NULL, "uv_offset[1]");
+	RNA_def_property_range(prop, -1, 1);
+	RNA_def_property_ui_range(prop, -1, 1, 2, 4);
+	RNA_def_property_ui_text(prop, "V Offset", "Amount to offset mirrored UVs from the 0.5 point on the V axis");
+	RNA_def_property_update(prop, 0, "rna_Modifier_update");
+
 	prop = RNA_def_property(srna, "merge_threshold", PROP_FLOAT, PROP_DISTANCE);
 	RNA_def_property_float_sdna(prop, NULL, "tolerance");
 	RNA_def_property_range(prop, 0, FLT_MAX);
@@ -4702,6 +4725,33 @@ static void rna_def_modifier_normaledit(BlenderRNA *brna)
 	RNA_def_property_update(prop, 0, "rna_Modifier_update");
 }
 
+static void rna_def_modifier_surfacedeform(BlenderRNA *brna)
+{
+	StructRNA *srna;
+	PropertyRNA *prop;
+	/* RNA struct backed by SurfaceDeformModifierData; it reuses the Mesh Deform icon. */
+	srna = RNA_def_struct(brna, "SurfaceDeformModifier", "Modifier");
+	RNA_def_struct_ui_text(srna, "SurfaceDeform Modifier", "");
+	RNA_def_struct_sdna(srna, "SurfaceDeformModifierData");
+	RNA_def_struct_ui_icon(srna, ICON_MOD_MESHDEFORM);
+	/* Target object pointer: custom setter and poll, changes trigger a dependency update. */
+	prop = RNA_def_property(srna, "target", PROP_POINTER, PROP_NONE);
+	RNA_def_property_ui_text(prop, "Target", "Mesh object to deform with");
+	RNA_def_property_pointer_funcs(prop, NULL, "rna_SurfaceDeformModifier_target_set", NULL, "rna_Mesh_object_poll");
+	RNA_def_property_flag(prop, PROP_EDITABLE | PROP_ID_SELF_CHECK);
+	RNA_def_property_update(prop, 0, "rna_Modifier_dependency_update");
+	/* Interpolation falloff, limited to the 2..16 range. */
+	prop = RNA_def_property(srna, "falloff", PROP_FLOAT, PROP_NONE);
+	RNA_def_property_range(prop, 2.0f, 16.0f);
+	RNA_def_property_ui_text(prop, "Interpolation falloff", "Controls how much nearby polygons influence deformation");
+	RNA_def_property_update(prop, 0, "rna_Modifier_update");
+	/* Read-only flag computed by the is_bound getter; no setter is registered. */
+	prop = RNA_def_property(srna, "is_bound", PROP_BOOLEAN, PROP_NONE);
+	RNA_def_property_boolean_funcs(prop, "rna_SurfaceDeformModifier_is_bound_get", NULL);
+	RNA_def_property_ui_text(prop, "Bound", "Whether geometry has been bound to target mesh");
+	RNA_def_property_clear_flag(prop, PROP_EDITABLE);
+}
+
 void RNA_def_modifier(BlenderRNA *brna)
 {
 	StructRNA *srna;
@@ -4819,6 +4869,7 @@ void RNA_def_modifier(BlenderRNA *brna)
 	rna_def_modifier_datatransfer(brna);
 	rna_def_modifier_normaledit(brna);
 	rna_def_modifier_meshseqcache(brna);
+	rna_def_modifier_surfacedeform(brna);
 }
 
 #endif
diff --git a/source/blender/makesrna/intern/rna_nodetree.c b/source/blender/makesrna/intern/rna_nodetree.c
index 8afaf1f8fcb..c3477644979 100644
--- a/source/blender/makesrna/intern/rna_nodetree.c
+++ b/source/blender/makesrna/intern/rna_nodetree.c
@@ -3263,6 +3263,12 @@ static EnumPropertyItem node_script_mode_items[] = {
 	{0, NULL, 0, NULL, NULL}
 };
 
+static EnumPropertyItem node_principled_distribution_items[] = {
+	{ SHD_GLOSSY_GGX, "GGX", 0, "GGX", "" },
+	{ SHD_GLOSSY_MULTI_GGX, "MULTI_GGX", 0, "Multiscatter GGX", "" },
+	{ 0, NULL, 0, NULL, NULL }
+};
+
 /* -- Common nodes ---------------------------------------------------------- */
 
 static void def_group_input(StructRNA *srna)
@@ -3336,6 +3342,7 @@ static void def_frame(StructRNA *srna)
 	RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
 
 	RNA_def_struct_sdna_from(srna, "NodeFrame", "storage");
+	RNA_def_struct_translation_context(srna, BLT_I18NCONTEXT_ID_NODETREE);
 	
 	prop = RNA_def_property(srna, "shrink", PROP_BOOLEAN, PROP_NONE);
 	RNA_def_property_boolean_sdna(prop, NULL, "flag", NODE_FRAME_SHRINK);
@@ -4198,17 +4205,11 @@ static void def_glass(StructRNA *srna)
 
 static void def_principled(StructRNA *srna)
 {
-	static EnumPropertyItem prop_principled_distribution_items[] = {
-		{ SHD_GLOSSY_GGX, "GGX", 0, "GGX", "" },
-		{ SHD_GLOSSY_MULTI_GGX, "MULTI_GGX", 0, "Multiscatter GGX", "" },
-		{ 0, NULL, 0, NULL, NULL }
-	};
-
 	PropertyRNA *prop;
 
 	prop = RNA_def_property(srna, "distribution", PROP_ENUM, PROP_NONE);
 	RNA_def_property_enum_sdna(prop, NULL, "custom1");
-	RNA_def_property_enum_items(prop, prop_principled_distribution_items);
+	RNA_def_property_enum_items(prop, node_principled_distribution_items);
 	RNA_def_property_ui_text(prop, "Distribution", "");
 	RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_ShaderNodePrincipled_update");
 }
@@ -5747,8 +5748,8 @@ static void def_cmp_glare(StructRNA *srna)
 	RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
 	
 	prop = RNA_def_property(srna, "streaks", PROP_INT, PROP_NONE);
-	RNA_def_property_int_sdna(prop, NULL, "angle");
-	RNA_def_property_range(prop, 2, 16);
+	RNA_def_property_int_sdna(prop, NULL, "streaks");
+	RNA_def_property_range(prop, 1, 16);
 	RNA_def_property_ui_text(prop, "Streaks", "Total number of streaks");
 	RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
 	
@@ -5765,7 +5766,7 @@ static void def_cmp_glare(StructRNA *srna)
 	RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
 	
 	prop = RNA_def_property(srna, "use_rotate_45", PROP_BOOLEAN, PROP_NONE);
-	RNA_def_property_boolean_sdna(prop, NULL, "angle", 0);
+	RNA_def_property_boolean_sdna(prop, NULL, "star_45", 0);
 	RNA_def_property_ui_text(prop, "Rotate 45", "Simple star filter: add 45 degree rotation offset");
 	RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
 	
diff --git a/source/blender/makesrna/intern/rna_object.c b/source/blender/makesrna/intern/rna_object.c
index 0cffba47f16..b3c166a6810 100644
--- a/source/blender/makesrna/intern/rna_object.c
+++ b/source/blender/makesrna/intern/rna_object.c
@@ -1321,8 +1321,12 @@ static void rna_Object_active_constraint_set(PointerRNA *ptr, PointerRNA value)
 
 static bConstraint *rna_Object_constraints_new(Object *object, int type)
 {
+	bConstraint *new_con = BKE_constraint_add_for_object(object, NULL, type);  /* create first: the new constraint is needed by the tagging call below */
+
+	ED_object_constraint_tag_update(object, new_con);  /* NOTE(review): presumably flags object/constraint for re-evaluation - confirm in ED_object */
 	WM_main_add_notifier(NC_OBJECT | ND_CONSTRAINT | NA_ADDED, object);
-	return BKE_constraint_add_for_object(object, NULL, type);
+
+	return new_con;
 }
 
 static void rna_Object_constraints_remove(Object *object, ReportList *reports, PointerRNA *con_ptr)
diff --git a/source/blender/makesrna/intern/rna_object_force.c b/source/blender/makesrna/intern/rna_object_force.c
index 1d89f7535c4..514fca1b011 100644
--- a/source/blender/makesrna/intern/rna_object_force.c
+++ b/source/blender/makesrna/intern/rna_object_force.c
@@ -1275,7 +1275,7 @@ static void rna_def_field(BlenderRNA *brna)
 	prop = RNA_def_property(srna, "falloff_power", PROP_FLOAT, PROP_NONE);
 	RNA_def_property_float_sdna(prop, NULL, "f_power");
 	RNA_def_property_range(prop, 0.0f, 10.0f);
-	RNA_def_property_ui_text(prop, "Falloff Power", "Falloff power (real gravitational falloff = 2)");
+	RNA_def_property_ui_text(prop, "Falloff Power", "");
 	RNA_def_property_update(prop, 0, "rna_FieldSettings_update");
 	
 	prop = RNA_def_property(srna, "distance_min", PROP_FLOAT, PROP_NONE);
@@ -1394,6 +1394,11 @@ static void rna_def_field(BlenderRNA *brna)
 	RNA_def_property_boolean_sdna(prop, NULL, "flag", PFIELD_SMOKE_DENSITY);
 	RNA_def_property_ui_text(prop, "Apply Density", "Adjust force strength based on smoke density");
 	RNA_def_property_update(prop, 0, "rna_FieldSettings_update");
+	prop = RNA_def_property(srna, "use_gravity_falloff", PROP_BOOLEAN, PROP_NONE);
+	RNA_def_property_boolean_sdna(prop, NULL, "flag", PFIELD_GRAVITATION);
+	RNA_def_property_ui_text(prop, "Gravity Falloff", "Multiply force by 1/distance²");
+	RNA_def_property_update(prop, 0, "rna_FieldSettings_update");
+
 	
 	/* Pointer */
 	
diff --git a/source/blender/makesrna/intern/rna_pose.c b/source/blender/makesrna/intern/rna_pose.c
index 28ce63a61bd..8d161466d56 100644
--- a/source/blender/makesrna/intern/rna_pose.c
+++ b/source/blender/makesrna/intern/rna_pose.c
@@ -524,12 +524,15 @@ static void rna_PoseChannel_active_constraint_set(PointerRNA *ptr, PointerRNA va
 	BKE_constraints_active_set(&pchan->constraints, (bConstraint *)value.data);
 }
 
-static bConstraint *rna_PoseChannel_constraints_new(bPoseChannel *pchan, int type)
+static bConstraint *rna_PoseChannel_constraints_new(ID *id, bPoseChannel *pchan, Main *bmain, int type)
 {
-	/*WM_main_add_notifier(NC_OBJECT|ND_CONSTRAINT|NA_ADDED, object); */
-	/* TODO, pass object also */
-	/* TODO, new pose bones don't have updated draw flags */
-	return BKE_constraint_add_for_pose(NULL, pchan, NULL, type);
+	Object *ob = (Object *)id;  /* id is the owning ID (function is registered with FUNC_USE_SELF_ID), used here as the object */
+	bConstraint *new_con = BKE_constraint_add_for_pose(ob, pchan, NULL, type);
+
+	ED_object_constraint_dependency_tag_update(bmain, ob, new_con);  /* bmain is supplied through FUNC_USE_MAIN */
+	WM_main_add_notifier(NC_OBJECT | ND_CONSTRAINT | NA_ADDED, id);
+
+	return new_con;
 }
 
 static void rna_PoseChannel_constraints_remove(ID *id, bPoseChannel *pchan, ReportList *reports, PointerRNA *con_ptr)
@@ -764,6 +767,7 @@ static void rna_def_pose_channel_constraints(BlenderRNA *brna, PropertyRNA *cpro
 	/* Constraint collection */
 	func = RNA_def_function(srna, "new", "rna_PoseChannel_constraints_new");
 	RNA_def_function_ui_description(func, "Add a constraint to this object");
+	RNA_def_function_flag(func, FUNC_USE_MAIN | FUNC_USE_SELF_ID); /* ID and Main needed for refresh */
 	/* return type */
 	parm = RNA_def_pointer(func, "constraint", "Constraint", "", "New constraint");
 	RNA_def_function_return(func, parm);
diff --git a/source/blender/makesrna/intern/rna_scene.c b/source/blender/makesrna/intern/rna_scene.c
index 1166fb89a0a..899466f05df 100644
--- a/source/blender/makesrna/intern/rna_scene.c
+++ b/source/blender/makesrna/intern/rna_scene.c
@@ -411,7 +411,7 @@ EnumPropertyItem rna_enum_gpencil_interpolation_mode_items[] = {
 	/* interpolation */
 	{0, "", 0, N_("Interpolation"), "Standard transitions between keyframes"},
 	{GP_IPO_LINEAR,   "LINEAR", ICON_IPO_LINEAR, "Linear", "Straight-line interpolation between A and B (i.e. no ease in/out)"},
-	{GP_IPO_CURVEMAP, "CUSTOM", ICON_IPO_BEZIER, "Custom", "Custom interpolation defined using a curvemap"},
+	{GP_IPO_CURVEMAP, "CUSTOM", ICON_IPO_BEZIER, "Custom", "Custom interpolation defined using a curve map"},
 	
 	/* easing */
 	{0, "", 0, N_("Easing (by strength)"), "Predefined inertial transitions, useful for motion graphics (from least to most ''dramatic'')"},
@@ -792,6 +792,21 @@ static void rna_Scene_frame_current_set(PointerRNA *ptr, int value)
 	data->r.cfra = value;
 }
 
+static float rna_Scene_frame_float_get(PointerRNA *ptr)
+{
+	const Scene *scene = (Scene *)ptr->data;  /* getter for "frame_float" */
+	return (float)scene->r.cfra + scene->r.subframe;  /* integer frame plus stored subframe */
+}
+
+static void rna_Scene_frame_float_set(PointerRNA *ptr, float value)
+{
+	Scene *data = (Scene *)ptr->data;  /* setter for "frame_float": splits value into r.cfra and r.subframe */
+	/* if negative frames aren't allowed, then we can't use them */
+	FRAMENUMBER_MIN_CLAMP(value);
+	data->r.cfra = (int)value - ((value < (float)(int)value) ? 1 : 0);  /* round down, not toward zero, so subframe stays >= 0 for negative frames */
+	data->r.subframe = value - data->r.cfra;
+}
+
 static float rna_Scene_frame_current_final_get(PointerRNA *ptr)
 {
 	Scene *scene = (Scene *)ptr->data;
@@ -872,6 +887,12 @@ static void rna_Scene_preview_range_end_frame_set(PointerRNA *ptr, int value)
 	data->r.pefra = value;
 }
 
+static void rna_Scene_show_subframe_update(Main *UNUSED(bmain), Scene *UNUSED(current_scene), PointerRNA *ptr)
+{
+	/* update callback of "show_subframe": clear the stored subframe of the owning scene */
+	((Scene *)ptr->id.data)->r.subframe = 0.0f;
+}
+
 static void rna_Scene_frame_update(Main *bmain, Scene *UNUSED(current_scene), PointerRNA *ptr)
 {
 	Scene *scene = (Scene *)ptr->id.data;
@@ -5369,7 +5390,7 @@ static void rna_def_scene_image_format_data(BlenderRNA *brna)
 	prop = RNA_def_property(srna, "jpeg2k_codec", PROP_ENUM, PROP_NONE);
 	RNA_def_property_enum_sdna(prop, NULL, "jp2_codec");
 	RNA_def_property_enum_items(prop, jp2_codec_items);
-	RNA_def_property_ui_text(prop, "Codec", "Codec settings for Jpek2000");
+	RNA_def_property_ui_text(prop, "Codec", "Codec settings for Jpeg2000");
 	RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
 #endif
 
@@ -7081,8 +7102,19 @@ void RNA_def_scene(BlenderRNA *brna)
 	prop = RNA_def_property(srna, "frame_subframe", PROP_FLOAT, PROP_TIME);
 	RNA_def_property_float_sdna(prop, NULL, "r.subframe");
 	RNA_def_property_ui_text(prop, "Current Sub-Frame", "");
-	RNA_def_property_clear_flag(prop, PROP_ANIMATABLE | PROP_EDITABLE);
-	
+	RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
+	RNA_def_property_range(prop, 0.0f, 1.0f);
+	RNA_def_property_ui_range(prop, 0.0f, 1.0f, 0.01, 2);
+	RNA_def_property_update(prop, NC_SCENE | ND_FRAME, "rna_Scene_frame_update");
+
+	prop = RNA_def_property(srna, "frame_float", PROP_FLOAT, PROP_TIME);
+	RNA_def_property_ui_text(prop, "Current Sub-Frame", "");
+	RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
+	RNA_def_property_range(prop, MINAFRAME, MAXFRAME);
+	RNA_def_property_ui_range(prop, MINAFRAME, MAXFRAME, 0.1, 2);
+	RNA_def_property_float_funcs(prop, "rna_Scene_frame_float_get", "rna_Scene_frame_float_set", NULL);
+	RNA_def_property_update(prop, NC_SCENE | ND_FRAME, "rna_Scene_frame_update");
+
 	prop = RNA_def_property(srna, "frame_start", PROP_INT, PROP_TIME);
 	RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
 	RNA_def_property_int_sdna(prop, NULL, "r.sfra");
@@ -7147,7 +7179,15 @@ void RNA_def_scene(BlenderRNA *brna)
 	RNA_def_property_int_funcs(prop, NULL, "rna_Scene_preview_range_end_frame_set", NULL);
 	RNA_def_property_ui_text(prop, "Preview Range End Frame", "Alternative end frame for UI playback");
 	RNA_def_property_update(prop, NC_SCENE | ND_FRAME, NULL);
-	
+
+	/* Subframe for moblur debug. */
+	prop = RNA_def_property(srna, "show_subframe", PROP_BOOLEAN, PROP_NONE);
+	RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
+	RNA_def_property_boolean_sdna(prop, NULL, "r.flag", SCER_SHOW_SUBFRAME);
+	RNA_def_property_ui_text(prop, "Show Subframe",
+	                         "Show current scene subframe and allow setting it using interface tools");
+	RNA_def_property_update(prop, NC_SCENE | ND_FRAME, "rna_Scene_show_subframe_update");  /* callback zeroes r.subframe */
+
 	/* Timeline / Time Navigation settings */
 	prop = RNA_def_property(srna, "show_keys_from_selected_only", PROP_BOOLEAN, PROP_NONE);
 	RNA_def_property_boolean_negative_sdna(prop, NULL, "flag", SCE_KEYS_NO_SELONLY);
diff --git a/source/blender/makesrna/intern/rna_scene_api.c b/source/blender/makesrna/intern/rna_scene_api.c
index fe781a309a4..f3146c9e6a8 100644
--- a/source/blender/makesrna/intern/rna_scene_api.c
+++ b/source/blender/makesrna/intern/rna_scene_api.c
@@ -243,9 +243,9 @@ static void rna_Scene_alembic_export(
 	    .use_subdiv_schema = use_subdiv_schema,
 	    .compression_type = compression_type,
 	    .packuv = packuv,
-		.triangulate = triangulate,
-		.quad_method = quad_method,
-		.ngon_method = ngon_method,
+	    .triangulate = triangulate,
+	    .quad_method = quad_method,
+	    .ngon_method = ngon_method,
 
 	    .global_scale = scale,
 	};
@@ -263,34 +263,55 @@ static void rna_Scene_alembic_export(
 /* don't remove this, as COLLADA exporting cannot be done through operators in render() callback. */
 #include "../../collada/collada.h"
 
+/* Note: This definition must match the generated function call */
 static void rna_Scene_collada_export(
         Scene *scene,
-        const char *filepath,
+        const char *filepath, 
         int apply_modifiers,
-        int export_mesh_type,
 
+        int export_mesh_type,
         int selected,
         int include_children,
         int include_armatures,
         int include_shapekeys,
         int deform_bones_only,
-
         int active_uv_only,
         int include_uv_textures,
         int include_material_textures,
         int use_texture_copies,
-
-        int use_ngons,
+        int triangulate,
         int use_object_instantiation,
         int use_blender_profile,
         int sort_by_name,
+        int export_transformation_type,
         int open_sim,
-        int export_transformation_type)
+        int keep_bind_info)
 {
-	collada_export(scene, filepath, apply_modifiers, export_mesh_type, selected,
-	               include_children, include_armatures, include_shapekeys, deform_bones_only,
-	               active_uv_only, include_uv_textures, include_material_textures,
-	               use_texture_copies, use_ngons, use_object_instantiation, use_blender_profile, sort_by_name, export_transformation_type, open_sim);
+	collada_export(scene,
+		filepath,
+
+		apply_modifiers,
+		export_mesh_type,
+
+		selected,
+		include_children,
+		include_armatures,
+		include_shapekeys,
+		deform_bones_only,
+
+		active_uv_only,
+		include_uv_textures,
+		include_material_textures,
+		use_texture_copies,
+
+		triangulate,
+		use_object_instantiation,
+		use_blender_profile,
+		sort_by_name,
+
+		export_transformation_type,
+		open_sim,
+		keep_bind_info);
 }
 
 #endif
@@ -354,30 +375,56 @@ void RNA_api_scene(StructRNA *srna)
 	parm = RNA_def_string(func, "filepath", NULL, FILE_MAX, "File Path", "File path to write Collada file");
 	RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
 	RNA_def_property_subtype(parm, PROP_FILEPATH); /* allow non utf8 */
-	RNA_def_boolean(func, "apply_modifiers", 0, "Apply Modifiers", "Apply modifiers");
+
+	RNA_def_boolean(func, "apply_modifiers", false,
+	                "Apply Modifiers", "Apply modifiers to exported mesh (non destructive)");
+
 	RNA_def_int(func, "export_mesh_type", 0, INT_MIN, INT_MAX,
 	            "Resolution", "Modifier resolution for export", INT_MIN, INT_MAX);
-	RNA_def_boolean(func, "selected", 0, "Selection Only", "Export only selected elements");
-	RNA_def_boolean(func, "include_children", 0, "Include Children", "Export all children of selected objects (even if not selected)");
-	RNA_def_boolean(func, "include_armatures", 0, "Include Armatures", "Export related armatures (even if not selected)");
-	RNA_def_boolean(func, "include_shapekeys", 0, "Include Shape Keys", "Export all Shape Keys from Mesh Objects");
-	RNA_def_boolean(func, "deform_bones_only", 0, "Deform Bones only", "Only export deforming bones with armatures");
-
-	RNA_def_boolean(func, "active_uv_only", 0, "Active UV Layer only", "Export only the active UV Layer");
-	RNA_def_boolean(func, "include_uv_textures", 0, "Include UV Textures", "Export textures assigned to the object UV maps");
-	RNA_def_boolean(func, "include_material_textures", 0, "Include Material Textures", "Export textures assigned to the object Materials");
-	RNA_def_boolean(func, "use_texture_copies", 0, "copy", "Copy textures to same folder where the .dae file is exported");
-
-	RNA_def_boolean(func, "use_ngons", 1, "Use NGons", "Keep NGons in Export");
-	RNA_def_boolean(func, "use_object_instantiation", 1, "Use Object Instances", "Instantiate multiple Objects from same Data");
-	RNA_def_boolean(func, "use_blender_profile", 1, "Use Blender Profile", "Export additional Blender specific information (for material, shaders, bones, etc.)");
-	RNA_def_boolean(func, "sort_by_name", 0, "Sort by Object name", "Sort exported data by Object name");
-	RNA_def_boolean(func, "open_sim", 0, "Export for SL/OpenSim", "Compatibility mode for SL, OpenSim and similar online worlds");
+
+	RNA_def_boolean(func, "selected", false, "Selection Only", "Export only selected elements");
+
+	RNA_def_boolean(func, "include_children", false,
+	                "Include Children", "Export all children of selected objects (even if not selected)");
+
+	RNA_def_boolean(func, "include_armatures", false,
+	                "Include Armatures", "Export related armatures (even if not selected)");
+
+	RNA_def_boolean(func, "include_shapekeys", true, "Include Shape Keys", "Export all Shape Keys from Mesh Objects");
+
+	RNA_def_boolean(func, "deform_bones_only", false,
+	                "Deform Bones only", "Only export deforming bones with armatures");
+
+	RNA_def_boolean(func, "active_uv_only", false, "Only Selected UV Map", "Export only the selected UV Map");
+
+	RNA_def_boolean(func, "include_uv_textures", false,
+	                "Include UV Textures", "Export textures assigned to the object UV Maps");
+
+	RNA_def_boolean(func, "include_material_textures", false,
+	                "Include Material Textures", "Export textures assigned to the object Materials");
+
+	RNA_def_boolean(func, "use_texture_copies", true,
+	                "Copy", "Copy textures to same folder where the .dae file is exported");
+
+	RNA_def_boolean(func, "triangulate", true, "Triangulate", "Export Polygons (Quads & NGons) as Triangles");
+
+	RNA_def_boolean(func, "use_object_instantiation", true,
+	                "Use Object Instances", "Instantiate multiple Objects from same Data");
+
+	RNA_def_boolean(func, "use_blender_profile", true, "Use Blender Profile",
+	                "Export additional Blender specific information (for material, shaders, bones, etc.)");
+
+	RNA_def_boolean(func, "sort_by_name", false, "Sort by Object name", "Sort exported data by Object name");
 
 	RNA_def_int(func, "export_transformation_type", 0, INT_MIN, INT_MAX,
-	            "Transformation", "Transformation type for translation, scale and rotation", INT_MIN, INT_MAX);
+	            "Transform", "Transformation type for translation, scale and rotation", INT_MIN, INT_MAX);
+
+	RNA_def_boolean(func, "open_sim", false,
+	                "Export to SL/OpenSim", "Compatibility mode for SL, OpenSim and other compatible online worlds");
+
+	RNA_def_boolean(func, "keep_bind_info", false, "Keep Bind Info",
+	                "Store bind pose information in custom bone properties for later use during Collada export");
 
-	RNA_def_function_ui_description(func, "Export to collada file");
 #endif
 
 #ifdef WITH_ALEMBIC
diff --git a/source/blender/makesrna/intern/rna_smoke.c b/source/blender/makesrna/intern/rna_smoke.c
index 6db370fc152..c12937bd2bf 100644
--- a/source/blender/makesrna/intern/rna_smoke.c
+++ b/source/blender/makesrna/intern/rna_smoke.c
@@ -832,14 +832,14 @@ static void rna_def_smoke_domain_settings(BlenderRNA *brna)
 	    {FLUID_FIELD_COLOR_R, "COLOR_R", 0, "Red", "Red component of the color field"},
 	    {FLUID_FIELD_COLOR_G, "COLOR_G", 0, "Green", "Green component of the color field"},
 	    {FLUID_FIELD_COLOR_B, "COLOR_B", 0, "Blue", "Blue component of the color field"},
-		{FLUID_FIELD_DENSITY, "DENSITY", 0, "Density", "Quantity of soot in the fluid"},
+	    {FLUID_FIELD_DENSITY, "DENSITY", 0, "Density", "Quantity of soot in the fluid"},
 	    {FLUID_FIELD_FLAME, "FLAME", 0, "Flame", "Flame field"},
 	    {FLUID_FIELD_FUEL, "FUEL", 0, "Fuel", "Fuel field"},
 	    {FLUID_FIELD_HEAT, "HEAT", 0, "Heat", "Temperature of the fluid"},
 	    {FLUID_FIELD_VELOCITY_X, "VELOCITY_X", 0, "X Velocity", "X component of the velocity field"},
 	    {FLUID_FIELD_VELOCITY_Y, "VELOCITY_Y", 0, "Y Velocity", "Y component of the velocity field"},
 	    {FLUID_FIELD_VELOCITY_Z, "VELOCITY_Z", 0, "Z Velocity", "Z component of the velocity field"},
-		{0, NULL, 0, NULL, NULL}
+	    {0, NULL, 0, NULL, NULL}
 	};
 
 	prop = RNA_def_property(srna, "coba_field", PROP_ENUM, PROP_NONE);
diff --git a/source/blender/makesrna/intern/rna_userdef.c b/source/blender/makesrna/intern/rna_userdef.c
index e68e67586e9..4d70b87843a 100644
--- a/source/blender/makesrna/intern/rna_userdef.c
+++ b/source/blender/makesrna/intern/rna_userdef.c
@@ -138,23 +138,11 @@ static void rna_userdef_update(Main *UNUSED(bmain), Scene *UNUSED(scene), Pointe
 }
 
 /* also used by buffer swap switching */
-static void rna_userdef_dpi_update(Main *UNUSED(bmain), Scene *UNUSED(scene), PointerRNA *UNUSED(ptr))
+static void rna_userdef_dpi_update(Main *bmain, Scene *UNUSED(scene), PointerRNA *UNUSED(ptr))
 {
 	/* font's are stored at each DPI level, without this we can easy load 100's of fonts */
 	BLF_cache_clear();
 
-	BKE_blender_userdef_refresh();
-	WM_main_add_notifier(NC_WINDOW, NULL);      /* full redraw */
-	WM_main_add_notifier(NC_SCREEN | NA_EDITED, NULL);    /* refresh region sizes */
-}
-
-static void rna_userdef_virtual_pixel_update(Main *bmain, Scene *UNUSED(scene), PointerRNA *UNUSED(ptr))
-{
-	/* font's are stored at each DPI level, without this we can easy load 100's of fonts */
-	BLF_cache_clear();
-	
-	BKE_blender_userdef_refresh();
-
 	/* force setting drawable again */
 	wmWindowManager *wm = bmain->wm.first;
 	if (wm) {
@@ -3325,6 +3313,12 @@ static void rna_def_userdef_view(BlenderRNA *brna)
 	RNA_def_struct_ui_text(srna, "View & Controls", "Preferences related to viewing data");
 
 	/* View  */
+	prop = RNA_def_property(srna, "ui_scale", PROP_FLOAT, PROP_FACTOR);
+	RNA_def_property_ui_text(prop, "UI Scale", "Changes the size of the fonts and buttons in the interface");
+	RNA_def_property_range(prop, 0.25f, 4.0f);
+	RNA_def_property_ui_range(prop, 0.5f, 2.0f, 1, 1);
+	RNA_def_property_float_default(prop, 1.0f);
+	RNA_def_property_update(prop, 0, "rna_userdef_dpi_update");
 
 	/* display */
 	prop = RNA_def_property(srna, "show_tooltips", PROP_BOOLEAN, PROP_NONE);
@@ -3916,12 +3910,6 @@ static void rna_def_userdef_system(BlenderRNA *brna)
 	    {0, NULL, 0, NULL, NULL}
 	};
 
-	static EnumPropertyItem virtual_pixel_mode_items[] = {
-		{VIRTUAL_PIXEL_NATIVE, "NATIVE", 0, "Native", "Use native pixel size of the display"},
-		{VIRTUAL_PIXEL_DOUBLE, "DOUBLE", 0, "Double", "Use double the native pixel size of the display"},
-		{0, NULL, 0, NULL, NULL}
-	};
-
 	srna = RNA_def_struct(brna, "UserPreferencesSystem", NULL);
 	RNA_def_struct_sdna(srna, "UserDef");
 	RNA_def_struct_nested(brna, srna, "UserPreferences");
@@ -3936,16 +3924,8 @@ static void rna_def_userdef_system(BlenderRNA *brna)
 	RNA_def_property_update(prop, NC_WINDOW, "rna_userdef_language_update");
 
 	prop = RNA_def_property(srna, "dpi", PROP_INT, PROP_NONE);
-	RNA_def_property_int_sdna(prop, NULL, "dpi");
-	RNA_def_property_range(prop, 48, 144);
+	RNA_def_property_clear_flag(prop, PROP_EDITABLE);
 	RNA_def_property_ui_text(prop, "DPI", "Font size and resolution for display");
-	RNA_def_property_update(prop, 0, "rna_userdef_dpi_update");
-
-	prop = RNA_def_property(srna, "virtual_pixel_mode", PROP_ENUM, PROP_NONE);
-	RNA_def_property_enum_sdna(prop, NULL, "virtual_pixel");
-	RNA_def_property_enum_items(prop, virtual_pixel_mode_items);
-	RNA_def_property_ui_text(prop, "Virtual Pixel Mode", "Modify the pixel size for hi-res devices");
-	RNA_def_property_update(prop, 0, "rna_userdef_virtual_pixel_update");
 
 	prop = RNA_def_property(srna, "pixel_size", PROP_FLOAT, PROP_NONE);
 	RNA_def_property_clear_flag(prop, PROP_EDITABLE);
@@ -4180,6 +4160,10 @@ static void rna_def_userdef_system(BlenderRNA *brna)
 	RNA_def_property_ui_text(prop, "Selection Method",
 	                         "Use OpenGL occlusion queries or selection render mode to accelerate selection");
 
+	prop = RNA_def_property(srna, "use_select_pick_depth", PROP_BOOLEAN, PROP_NONE);
+	RNA_def_property_boolean_sdna(prop, NULL, "gpu_select_pick_deph", 1);
+	RNA_def_property_ui_text(prop, "OpenGL Depth Picking", "Use the depth buffer for picking 3D View selection");
+
 	/* Full scene anti-aliasing */
 	prop = RNA_def_property(srna, "multi_sample", PROP_ENUM, PROP_NONE);
 	RNA_def_property_enum_bitflag_sdna(prop, NULL, "ogl_multisamples");
@@ -4674,6 +4658,11 @@ void RNA_def_userdef(BlenderRNA *brna)
 	                         "Active section of the user preferences shown in the user interface");
 	RNA_def_property_update(prop, 0, "rna_userdef_update");
 
+	/* don't expose this directly via the UI, modify via an operator */
+	prop = RNA_def_property(srna, "app_template", PROP_STRING, PROP_NONE);
+	RNA_def_property_string_sdna(prop, NULL, "app_template");
+	RNA_def_property_ui_text(prop, "Application Template", "");
+
 	prop = RNA_def_property(srna, "themes", PROP_COLLECTION, PROP_NONE);
 	RNA_def_property_collection_sdna(prop, NULL, "themes", NULL);
 	RNA_def_property_struct_type(prop, "Theme");
diff --git a/source/blender/makesrna/intern/rna_wm_api.c b/source/blender/makesrna/intern/rna_wm_api.c
index 1f23ab938fb..677ea92aea1 100644
--- a/source/blender/makesrna/intern/rna_wm_api.c
+++ b/source/blender/makesrna/intern/rna_wm_api.c
@@ -87,6 +87,11 @@ static void rna_Operator_report(wmOperator *op, int type, const char *msg)
 	BKE_report(op->reports, type, msg);
 }
 
+static int rna_Operator_is_repeat(wmOperator *op, bContext *C)
+{
+	return WM_operator_is_repeat(C, op);  /* forwards to the WM call, which takes (context, operator) */
+}
+
 /* since event isn't needed... */
 static void rna_Operator_enum_search_invoke(bContext *C, wmOperator *op)
 {
@@ -521,6 +526,12 @@ void RNA_api_operator(StructRNA *srna)
 	parm = RNA_def_string(func, "message", NULL, 0, "Report Message", "");
 	RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
 
+	/* utility, not for registering */
+	func = RNA_def_function(srna, "is_repeat", "rna_Operator_is_repeat");
+	RNA_def_function_flag(func, FUNC_USE_CONTEXT);
+	/* return */
+	parm = RNA_def_boolean(func, "result", 0, "result", "");
+	RNA_def_function_return(func, parm);
 
 	/* Registration */
 
diff --git a/source/blender/modifiers/CMakeLists.txt b/source/blender/modifiers/CMakeLists.txt
index bacfc177432..ad2b862141c 100644
--- a/source/blender/modifiers/CMakeLists.txt
+++ b/source/blender/modifiers/CMakeLists.txt
@@ -93,6 +93,7 @@ set(SRC
 	intern/MOD_solidify.c
 	intern/MOD_subsurf.c
 	intern/MOD_surface.c
+	intern/MOD_surfacedeform.c
 	intern/MOD_triangulate.c
 	intern/MOD_util.c
 	intern/MOD_uvwarp.c
diff --git a/source/blender/modifiers/MOD_modifiertypes.h b/source/blender/modifiers/MOD_modifiertypes.h
index 4c881445893..bf121af2bd1 100644
--- a/source/blender/modifiers/MOD_modifiertypes.h
+++ b/source/blender/modifiers/MOD_modifiertypes.h
@@ -85,6 +85,7 @@ extern ModifierTypeInfo modifierType_DataTransfer;
 extern ModifierTypeInfo modifierType_NormalEdit;
 extern ModifierTypeInfo modifierType_CorrectiveSmooth;
 extern ModifierTypeInfo modifierType_MeshSequenceCache;
+extern ModifierTypeInfo modifierType_SurfaceDeform;
 
 /* MOD_util.c */
 void modifier_type_init(ModifierTypeInfo *types[]);
diff --git a/source/blender/modifiers/intern/MOD_boolean.c b/source/blender/modifiers/intern/MOD_boolean.c
index f828bc68857..f86d8b99f3c 100644
--- a/source/blender/modifiers/intern/MOD_boolean.c
+++ b/source/blender/modifiers/intern/MOD_boolean.c
@@ -319,6 +319,7 @@ static DerivedMesh *applyModifier_bmesh(
 				        use_separate,
 				        use_dissolve,
 				        use_island_connect,
+				        false,
 				        bmd->operation,
 				        bmd->double_threshold);
 
diff --git a/source/blender/modifiers/intern/MOD_displace.c b/source/blender/modifiers/intern/MOD_displace.c
index 08d118a2026..18f60bab490 100644
--- a/source/blender/modifiers/intern/MOD_displace.c
+++ b/source/blender/modifiers/intern/MOD_displace.c
@@ -38,10 +38,12 @@
 
 #include "BLI_utildefines.h"
 #include "BLI_math.h"
+#include "BLI_task.h"
 
 #include "BKE_cdderivedmesh.h"
 #include "BKE_library.h"
 #include "BKE_library_query.h"
+#include "BKE_image.h"
 #include "BKE_mesh.h"
 #include "BKE_modifier.h"
 #include "BKE_texture.h"
@@ -200,21 +202,130 @@ static void updateDepsgraph(ModifierData *md,
 	}
 }
 
+typedef struct DisplaceUserdata {  /* arguments handed to displaceModifier_do_task() through its void *userdata */
+	/*const*/ DisplaceModifierData *dmd;
+	struct ImagePool *pool;  /* passed to BKE_texture_get_value_ex(); only created when dmd->texture is set, otherwise NULL */
+	MDeformVert *dvert;  /* indexed per vertex for the weight lookup; may be NULL */
+	float weight;  /* initial weight; replaced per vertex when dvert is set, not used otherwise */
+	int defgrp_index;  /* group index given to defvert_find_weight() */
+	int direction;  /* MOD_DISP_DIR_* value selecting the displacement case */
+	bool use_global_direction;  /* when true, X/Y/Z/RGB displacement goes through local_mat */
+	float (*tex_co)[3];  /* per-vertex coordinates used for the texture lookup */
+	float (*vertexCos)[3];  /* per-vertex positions, modified in place */
+	float local_mat[4][4];  /* matrix used only when use_global_direction is true */
+	MVert *mvert;  /* source of the normals read for MOD_DISP_DIR_NOR */
+	float (*vert_clnors)[3];  /* per-vertex vectors used for MOD_DISP_DIR_CLNOR */
+} DisplaceUserdata;
+
+static void displaceModifier_do_task(void *userdata, const int iter)
+{  /* displaces the single vertex at index iter; run over [0, numVerts) via BLI_task_parallel_range() */
+	DisplaceUserdata *data = (DisplaceUserdata *)userdata;
+	DisplaceModifierData *dmd = data->dmd;
+	MDeformVert *dvert = data->dvert;
+	float weight = data->weight;
+	int defgrp_index = data->defgrp_index;
+	int direction = data->direction;
+	bool use_global_direction = data->use_global_direction;
+	float (*tex_co)[3] = data->tex_co;
+	float (*vertexCos)[3] = data->vertexCos;
+	MVert *mvert = data->mvert;
+	float (*vert_clnors)[3] = data->vert_clnors;
+
+	const float delta_fixed = 1.0f - dmd->midlevel;  /* when no texture is used, we fallback to white */
+
+	TexResult texres;
+	float strength = dmd->strength;
+	float delta;
+	float local_vec[3];
+
+	if (dvert) {
+		weight = defvert_find_weight(dvert + iter, defgrp_index);
+		if (weight == 0.0f) {
+			return;  /* zero weight: leave this vertex untouched */
+		}
+	}
+
+	if (dmd->texture) {
+		texres.nor = NULL;
+		BKE_texture_get_value_ex(dmd->modifier.scene, dmd->texture, tex_co[iter], &texres, data->pool, false);
+		delta = texres.tin - dmd->midlevel;
+	}
+	else {
+		delta = delta_fixed;  /* (1.0f - dmd->midlevel) */  /* never changes */
+	}
+
+	if (dvert) {
+		strength *= weight;
+	}
+
+	delta *= strength;
+	CLAMP(delta, -10000, 10000);
+
+	switch (direction) {
+		case MOD_DISP_DIR_X:
+			if (use_global_direction) {
+				vertexCos[iter][0] += delta * data->local_mat[0][0];  /* uses column 0 of local_mat */
+				vertexCos[iter][1] += delta * data->local_mat[1][0];
+				vertexCos[iter][2] += delta * data->local_mat[2][0];
+			}
+			else {
+				vertexCos[iter][0] += delta;
+			}
+			break;
+		case MOD_DISP_DIR_Y:
+			if (use_global_direction) {
+				vertexCos[iter][0] += delta * data->local_mat[0][1];  /* uses column 1 of local_mat */
+				vertexCos[iter][1] += delta * data->local_mat[1][1];
+				vertexCos[iter][2] += delta * data->local_mat[2][1];
+			}
+			else {
+				vertexCos[iter][1] += delta;
+			}
+			break;
+		case MOD_DISP_DIR_Z:
+			if (use_global_direction) {
+				vertexCos[iter][0] += delta * data->local_mat[0][2];  /* uses column 2 of local_mat */
+				vertexCos[iter][1] += delta * data->local_mat[1][2];
+				vertexCos[iter][2] += delta * data->local_mat[2][2];
+			}
+			else {
+				vertexCos[iter][2] += delta;
+			}
+			break;
+		case MOD_DISP_DIR_RGB_XYZ:  /* reads texres, so needs a texture; displaceModifier_do() returns early when there is none */
+			local_vec[0] = texres.tr - dmd->midlevel;
+			local_vec[1] = texres.tg - dmd->midlevel;
+			local_vec[2] = texres.tb - dmd->midlevel;
+			if (use_global_direction) {
+				mul_transposed_mat3_m4_v3(data->local_mat, local_vec);
+			}
+			mul_v3_fl(local_vec, strength);  /* scaled by strength directly; the clamped delta is not used in this case */
+			add_v3_v3(vertexCos[iter], local_vec);
+			break;
+		case MOD_DISP_DIR_NOR:
+			vertexCos[iter][0] += delta * (mvert[iter].no[0] / 32767.0f);  /* NOTE(review): presumably converts a short-encoded normal to float - confirm */
+			vertexCos[iter][1] += delta * (mvert[iter].no[1] / 32767.0f);
+			vertexCos[iter][2] += delta * (mvert[iter].no[2] / 32767.0f);
+			break;
+		case MOD_DISP_DIR_CLNOR:
+			madd_v3_v3fl(vertexCos[iter], vert_clnors[iter], delta);
+			break;
+	}
+}
+
 /* dm must be a CDDerivedMesh */
 static void displaceModifier_do(
         DisplaceModifierData *dmd, Object *ob,
         DerivedMesh *dm, float (*vertexCos)[3], int numVerts)
 {
-	int i;
 	MVert *mvert;
 	MDeformVert *dvert;
 	int direction = dmd->direction;
 	int defgrp_index;
 	float (*tex_co)[3];
 	float weight = 1.0f; /* init value unused but some compilers may complain */
-	const float delta_fixed = 1.0f - dmd->midlevel;  /* when no texture is used, we fallback to white */
 	float (*vert_clnors)[3] = NULL;
-	float local_mat[4][4];
+	float local_mat[4][4] = {{0}};
 	const bool use_global_direction = dmd->space == MOD_DISP_SPACE_GLOBAL;
 
 	if (!dmd->texture && dmd->direction == MOD_DISP_DIR_RGB_XYZ) return;
@@ -259,81 +370,25 @@ static void displaceModifier_do(
 		copy_m4_m4(local_mat, ob->obmat);
 	}
 
-	for (i = 0; i < numVerts; i++) {
-		TexResult texres;
-		float strength = dmd->strength;
-		float delta;
-		float local_vec[3];
-
-		if (dvert) {
-			weight = defvert_find_weight(dvert + i, defgrp_index);
-			if (weight == 0.0f) continue;
-		}
-
-		if (dmd->texture) {
-			texres.nor = NULL;
-			BKE_texture_get_value(dmd->modifier.scene, dmd->texture, tex_co[i], &texres, false);
-			delta = texres.tin - dmd->midlevel;
-		}
-		else {
-			delta = delta_fixed;  /* (1.0f - dmd->midlevel) */  /* never changes */
-		}
+	DisplaceUserdata data = {NULL};
+	data.dmd = dmd;
+	data.dvert = dvert;
+	data.weight = weight;
+	data.defgrp_index = defgrp_index;
+	data.direction = direction;
+	data.use_global_direction = use_global_direction;
+	data.tex_co = tex_co;
+	data.vertexCos = vertexCos;
+	copy_m4_m4(data.local_mat, local_mat);
+	data.mvert = mvert;
+	data.vert_clnors = vert_clnors;
+	if (dmd->texture != NULL) {
+		data.pool = BKE_image_pool_new();
+	}
+	BLI_task_parallel_range(0, numVerts, &data, displaceModifier_do_task, numVerts > 512);
 
-		if (dvert) strength *= weight;
-
-		delta *= strength;
-		CLAMP(delta, -10000, 10000);
-
-		switch (direction) {
-			case MOD_DISP_DIR_X:
-				if (use_global_direction) {
-					vertexCos[i][0] += delta * local_mat[0][0];
-					vertexCos[i][1] += delta * local_mat[1][0];
-					vertexCos[i][2] += delta * local_mat[2][0];
-				}
-				else {
-					vertexCos[i][0] += delta;
-				}
-				break;
-			case MOD_DISP_DIR_Y:
-				if (use_global_direction) {
-					vertexCos[i][0] += delta * local_mat[0][1];
-					vertexCos[i][1] += delta * local_mat[1][1];
-					vertexCos[i][2] += delta * local_mat[2][1];
-				}
-				else {
-					vertexCos[i][1] += delta;
-				}
-				break;
-			case MOD_DISP_DIR_Z:
-				if (use_global_direction) {
-					vertexCos[i][0] += delta * local_mat[0][2];
-					vertexCos[i][1] += delta * local_mat[1][2];
-					vertexCos[i][2] += delta * local_mat[2][2];
-				}
-				else {
-					vertexCos[i][2] += delta;
-				}
-				break;
-			case MOD_DISP_DIR_RGB_XYZ:
-				local_vec[0] = texres.tr - dmd->midlevel;
-				local_vec[1] = texres.tg - dmd->midlevel;
-				local_vec[2] = texres.tb - dmd->midlevel;
-				if (use_global_direction) {
-					mul_transposed_mat3_m4_v3(local_mat, local_vec);
-				}
-				mul_v3_fl(local_vec, strength);
-				add_v3_v3(vertexCos[i], local_vec);
-				break;
-			case MOD_DISP_DIR_NOR:
-				vertexCos[i][0] += delta * (mvert[i].no[0] / 32767.0f);
-				vertexCos[i][1] += delta * (mvert[i].no[1] / 32767.0f);
-				vertexCos[i][2] += delta * (mvert[i].no[2] / 32767.0f);
-				break;
-			case MOD_DISP_DIR_CLNOR:
-				madd_v3_v3fl(vertexCos[i], vert_clnors[i], delta);
-				break;
-		}
+	if (data.pool != NULL) {
+		BKE_image_pool_free(data.pool);
 	}
 
 	if (tex_co) {
diff --git a/source/blender/modifiers/intern/MOD_dynamicpaint.c b/source/blender/modifiers/intern/MOD_dynamicpaint.c
index 05068b9b597..bb75d655802 100644
--- a/source/blender/modifiers/intern/MOD_dynamicpaint.c
+++ b/source/blender/modifiers/intern/MOD_dynamicpaint.c
@@ -116,7 +116,7 @@ static DerivedMesh *applyModifier(ModifierData *md, Object *ob,
 
 static bool is_brush_cb(Object *UNUSED(ob), ModifierData *pmd)
 {
-	return ((DynamicPaintModifierData*)pmd)->brush != NULL;
+	return ((DynamicPaintModifierData *)pmd)->brush != NULL;
 }
 
 static void updateDepgraph(ModifierData *md, DagForest *forest,
diff --git a/source/blender/modifiers/intern/MOD_mirror.c b/source/blender/modifiers/intern/MOD_mirror.c
index dd127cc255c..9705edc580c 100644
--- a/source/blender/modifiers/intern/MOD_mirror.c
+++ b/source/blender/modifiers/intern/MOD_mirror.c
@@ -274,8 +274,8 @@ static DerivedMesh *doMirrorOnAxis(MirrorModifierData *mmd,
 			int j = maxLoops;
 			dmloopuv += j; /* second set of loops only */
 			for (; j-- > 0; dmloopuv++) {
-				if (do_mirr_u) dmloopuv->uv[0] = 1.0f - dmloopuv->uv[0];
-				if (do_mirr_v) dmloopuv->uv[1] = 1.0f - dmloopuv->uv[1];
+				if (do_mirr_u) dmloopuv->uv[0] = 1.0f - dmloopuv->uv[0] + mmd->uv_offset[0];
+				if (do_mirr_v) dmloopuv->uv[1] = 1.0f - dmloopuv->uv[1] + mmd->uv_offset[1];
 			}
 		}
 	}
diff --git a/source/blender/modifiers/intern/MOD_surface.c b/source/blender/modifiers/intern/MOD_surface.c
index 68987a1d28e..e5b0f9c0001 100644
--- a/source/blender/modifiers/intern/MOD_surface.c
+++ b/source/blender/modifiers/intern/MOD_surface.c
@@ -172,6 +172,7 @@ ModifierTypeInfo modifierType_Surface = {
 	/* structSize */        sizeof(SurfaceModifierData),
 	/* type */              eModifierTypeType_OnlyDeform,
 	/* flags */             eModifierTypeFlag_AcceptsMesh |
+	                        eModifierTypeFlag_AcceptsCVs |
 	                        eModifierTypeFlag_NoUserAdd,
 
 	/* copyData */          NULL,
diff --git a/source/blender/modifiers/intern/MOD_surfacedeform.c b/source/blender/modifiers/intern/MOD_surfacedeform.c
new file mode 100644
index 00000000000..776cf02754e
--- /dev/null
+++ b/source/blender/modifiers/intern/MOD_surfacedeform.c
@@ -0,0 +1,1226 @@
+#include "DNA_object_types.h"
+#include "DNA_scene_types.h"
+
+#include "BLI_alloca.h"
+#include "BLI_math.h"
+#include "BLI_math_geom.h"
+#include "BLI_task.h"
+
+#include "BKE_cdderivedmesh.h"
+#include "BKE_editmesh.h"
+#include "BKE_library_query.h"
+#include "BKE_modifier.h"
+
+#include "depsgraph_private.h"
+
+#include "MEM_guardedalloc.h"
+
+#include "MOD_util.h"
+
+typedef struct SDefAdjacency {
+	struct SDefAdjacency *next;
+	unsigned int index;
+} SDefAdjacency;
+
+typedef struct SDefAdjacencyArray {
+	SDefAdjacency *first;
+	unsigned int num; /* Careful, this is twice the number of polygons (avoids an extra loop) */
+} SDefAdjacencyArray;
+
+typedef struct SDefEdgePolys {
+	unsigned int polys[2], num;
+} SDefEdgePolys;
+
+typedef struct SDefBindCalcData {
+	BVHTreeFromMesh * const treeData;
+	const SDefAdjacencyArray * const vert_edges;
+	const SDefEdgePolys * const edge_polys;
+	SDefVert * const bind_verts;
+	const MLoopTri * const looptri;
+	const MPoly * const mpoly;
+	const MEdge * const medge;
+	const MLoop * const mloop;
+	float (* const targetCos)[3];
+	float (* const vertexCos)[3];
+	float imat[4][4];
+	const float falloff;
+	int success;
+} SDefBindCalcData;
+
+typedef struct SDefBindPoly {
+	float (*coords)[3];
+	float (*coords_v2)[2];
+	float point_v2[2];
+	float weight_angular;
+	float weight_dist_proj;
+	float weight_dist;
+	float weight;
+	float scales[2];
+	float centroid[3];
+	float centroid_v2[2];
+	float normal[3];
+	float cent_edgemid_vecs_v2[2][2];
+	float edgemid_angle;
+	float point_edgemid_angles[2];
+	float corner_edgemid_angles[2];
+	float dominant_angle_weight;
+	unsigned int index;
+	unsigned int numverts;
+	unsigned int loopstart;
+	unsigned int edge_inds[2];
+	unsigned int edge_vert_inds[2];
+	unsigned int corner_ind;
+	unsigned int dominant_edge;
+	bool inside;
+} SDefBindPoly;
+
+typedef struct SDefBindWeightData {
+	SDefBindPoly *bind_polys;
+	unsigned int numpoly;
+	unsigned int numbinds;
+} SDefBindWeightData;
+
+typedef struct SDefDeformData {
+	const SDefVert * const bind_verts;
+	float (* const targetCos)[3];
+	float (* const vertexCos)[3];
+} SDefDeformData;
+
+/* Bind result values */
+enum {
+	MOD_SDEF_BIND_RESULT_SUCCESS = 1,
+	MOD_SDEF_BIND_RESULT_GENERIC_ERR = 0,
+	MOD_SDEF_BIND_RESULT_MEM_ERR = -1,
+	MOD_SDEF_BIND_RESULT_NONMANY_ERR = -2,
+	MOD_SDEF_BIND_RESULT_CONCAVE_ERR = -3,
+	MOD_SDEF_BIND_RESULT_OVERLAP_ERR = -4,
+};
+
+/* Infinite weight flags */
+enum {
+	MOD_SDEF_INFINITE_WEIGHT_ANGULAR = (1 << 0),
+	MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ = (1 << 1),
+	MOD_SDEF_INFINITE_WEIGHT_DIST = (1 << 2),
+};
+
+static void initData(ModifierData *md)
+{
+	SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
+	smd->target = NULL;
+	smd->verts = NULL;
+	smd->flags = 0;
+	smd->falloff = 4.0f;
+}
+
+static void freeData(ModifierData *md)
+{
+	SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
+
+	if (smd->verts) {
+		for (int i = 0; i < smd->numverts; i++) {
+			if (smd->verts[i].binds) {
+				for (int j = 0; j < smd->verts[i].numbinds; j++) {
+					MEM_SAFE_FREE(smd->verts[i].binds[j].vert_inds);
+					MEM_SAFE_FREE(smd->verts[i].binds[j].vert_weights);
+				}
+
+				MEM_freeN(smd->verts[i].binds);
+			}
+		}
+
+		MEM_freeN(smd->verts);
+		smd->verts = NULL;
+	}
+}
+
+static void copyData(ModifierData *md, ModifierData *target)
+{
+	SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
+	SurfaceDeformModifierData *tsmd = (SurfaceDeformModifierData *)target;
+
+	*tsmd = *smd;
+
+	if (smd->verts) {
+		tsmd->verts = MEM_dupallocN(smd->verts);
+
+		for (int i = 0; i < smd->numverts; i++) {
+			if (smd->verts[i].binds) {
+				tsmd->verts[i].binds = MEM_dupallocN(smd->verts[i].binds);
+
+				for (int j = 0; j < smd->verts[i].numbinds; j++) {
+					if (smd->verts[i].binds[j].vert_inds) {
+						tsmd->verts[i].binds[j].vert_inds = MEM_dupallocN(smd->verts[i].binds[j].vert_inds);
+					}
+
+					if (smd->verts[i].binds[j].vert_weights) {
+						tsmd->verts[i].binds[j].vert_weights = MEM_dupallocN(smd->verts[i].binds[j].vert_weights);
+					}
+				}
+			}
+		}
+	}
+}
+
+static void foreachObjectLink(ModifierData *md, Object *ob, ObjectWalkFunc walk, void *userData)
+{
+	SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
+
+	walk(userData, ob, &smd->target, IDWALK_NOP);
+}
+
+static void updateDepgraph(ModifierData *md, DagForest *forest,
+                           struct Main *UNUSED(bmain),
+                           struct Scene *UNUSED(scene),
+                           Object *UNUSED(ob),
+                           DagNode *obNode)
+{
+	SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
+
+	if (smd->target) {
+		DagNode *curNode = dag_get_node(forest, smd->target);
+
+		dag_add_relation(forest, curNode, obNode, DAG_RL_DATA_DATA, "Surface Deform Modifier");
+	}
+}
+
+static void updateDepsgraph(ModifierData *md,
+                            struct Main *UNUSED(bmain),
+                            struct Scene *UNUSED(scene),
+                            Object *UNUSED(ob),
+                            struct DepsNodeHandle *node)
+{
+	SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
+	if (smd->target != NULL) {
+		DEG_add_object_relation(node, smd->target, DEG_OB_COMP_GEOMETRY, "Surface Deform Modifier");
+	}
+}
+
+static void freeAdjacencyMap(SDefAdjacencyArray * const vert_edges, SDefAdjacency * const adj_ref, SDefEdgePolys * const edge_polys)
+{
+	MEM_freeN(edge_polys);
+
+	MEM_freeN(adj_ref);
+
+	MEM_freeN(vert_edges);
+}
+
+static int buildAdjacencyMap(const MPoly *poly, const MEdge *edge, const MLoop * const mloop, const unsigned int numpoly, const unsigned int numedges,
+                              SDefAdjacencyArray * const vert_edges, SDefAdjacency *adj, SDefEdgePolys * const edge_polys)
+{
+	const MLoop *loop;
+
+	/* Find polygons adjacent to edges */
+	for (int i = 0; i < numpoly; i++, poly++) {
+		loop = &mloop[poly->loopstart];
+
+		for (int j = 0; j < poly->totloop; j++, loop++) {
+			if (edge_polys[loop->e].num == 0) {
+				edge_polys[loop->e].polys[0] = i;
+				edge_polys[loop->e].polys[1] = -1;
+				edge_polys[loop->e].num++;
+			}
+			else if (edge_polys[loop->e].num == 1) {
+				edge_polys[loop->e].polys[1] = i;
+				edge_polys[loop->e].num++;
+			}
+			else {
+				return MOD_SDEF_BIND_RESULT_NONMANY_ERR;
+			}
+		}
+	}
+
+	/* Find edges adjacent to vertices */
+	for (int i = 0; i < numedges; i++, edge++) {
+		adj->next = vert_edges[edge->v1].first;
+		adj->index = i;
+		vert_edges[edge->v1].first = adj;
+		vert_edges[edge->v1].num += edge_polys[i].num;
+		adj++;
+
+		adj->next = vert_edges[edge->v2].first;
+		adj->index = i;
+		vert_edges[edge->v2].first = adj;
+		vert_edges[edge->v2].num += edge_polys[i].num;
+		adj++;
+	}
+
+	return MOD_SDEF_BIND_RESULT_SUCCESS;
+}
+
+BLI_INLINE void sortPolyVertsEdge(unsigned int *indices, const MLoop * const mloop, const unsigned int edge, const unsigned int num)
+{
+	bool found = false;
+
+	for (int i = 0; i < num; i++) {
+		if (mloop[i].e == edge) {
+			found = true;
+		}
+		if (found) {
+			*indices = mloop[i].v;
+			indices++;
+		}
+	}
+
+	/* Fill in remaining vertex indices that occur before the edge */
+	for (int i = 0; mloop[i].e != edge; i++) {
+		*indices = mloop[i].v;
+		indices++;
+	}
+}
+
+BLI_INLINE void sortPolyVertsTri(unsigned int *indices, const MLoop * const mloop, const unsigned int loopstart, const unsigned int num)
+{
+	for (int i = loopstart; i < num; i++) {
+		*indices = mloop[i].v;
+		indices++;
+	}
+
+	for (int i = 0; i < loopstart; i++) {
+		*indices = mloop[i].v;
+		indices++;
+	}
+}
+
+BLI_INLINE unsigned int nearestVert(SDefBindCalcData * const data, const float point_co[3])
+{
+	BVHTreeNearest nearest = {.dist_sq = FLT_MAX, .index = -1};
+	const MPoly *poly;
+	const MEdge *edge;
+	const MLoop *loop;
+	float t_point[3];
+	float max_dist = FLT_MAX;
+	float dist;
+	unsigned int index = 0;
+
+	mul_v3_m4v3(t_point, data->imat, point_co);
+
+	BLI_bvhtree_find_nearest(data->treeData->tree, t_point, &nearest, data->treeData->nearest_callback, data->treeData);
+
+	poly = &data->mpoly[data->looptri[nearest.index].poly];
+	loop = &data->mloop[poly->loopstart];
+
+	for (int i = 0; i < poly->totloop; i++, loop++) {
+		edge = &data->medge[loop->e];
+		dist = dist_squared_to_line_segment_v3(point_co, data->targetCos[edge->v1], data->targetCos[edge->v2]);
+
+		if (dist < max_dist) {
+			max_dist = dist;
+			index = loop->e;
+		}
+	}
+
+	edge = &data->medge[index];
+	if (len_squared_v3v3(point_co, data->targetCos[edge->v1]) < len_squared_v3v3(point_co, data->targetCos[edge->v2])) {
+		return edge->v1;
+	}
+	else {
+		return edge->v2;
+	}
+}
+
+BLI_INLINE int isPolyValid(const float coords[][2], const unsigned int nr)
+{
+	float prev_co[2];
+	float curr_vec[2], prev_vec[2];
+
+	if (!is_poly_convex_v2(coords, nr)) {
+		return MOD_SDEF_BIND_RESULT_CONCAVE_ERR;
+	}
+
+	copy_v2_v2(prev_co, coords[nr - 1]);
+	sub_v2_v2v2(prev_vec, prev_co, coords[nr - 2]);
+
+	for (int i = 0; i < nr; i++) {
+		sub_v2_v2v2(curr_vec, coords[i], prev_co);
+
+		if (len_squared_v2(curr_vec) < FLT_EPSILON) {
+			return MOD_SDEF_BIND_RESULT_OVERLAP_ERR;
+		}
+
+		if (1.0f - dot_v2v2(prev_vec, curr_vec) < FLT_EPSILON) {
+			return MOD_SDEF_BIND_RESULT_CONCAVE_ERR;
+		}
+
+		copy_v2_v2(prev_co, coords[i]);
+		copy_v2_v2(prev_vec, curr_vec);
+	}
+
+	return MOD_SDEF_BIND_RESULT_SUCCESS;
+}
+
+static void freeBindData(SDefBindWeightData * const bwdata)
+{
+	SDefBindPoly *bpoly = bwdata->bind_polys;
+
+	if (bwdata->bind_polys) {
+		for (int i = 0; i < bwdata->numpoly; bpoly++, i++) {
+			MEM_SAFE_FREE(bpoly->coords);
+			MEM_SAFE_FREE(bpoly->coords_v2);
+		}
+
+		MEM_freeN(bwdata->bind_polys);
+	}
+
+	MEM_freeN(bwdata);
+}
+
+BLI_INLINE float computeAngularWeight(const float point_angle, const float edgemid_angle)
+{
+	float weight;
+
+	weight = point_angle;
+	weight /= edgemid_angle;
+	weight *= M_PI_2;
+
+	return sinf(weight);
+}
+
+BLI_INLINE SDefBindWeightData *computeBindWeights(SDefBindCalcData * const data, const float point_co[3])
+{
+	const unsigned int nearest = nearestVert(data, point_co);
+	const SDefAdjacency * const vert_edges = data->vert_edges[nearest].first;
+	const SDefEdgePolys * const edge_polys = data->edge_polys;
+
+	const SDefAdjacency *vedge;
+	const MPoly *poly;
+	const MLoop *loop;
+
+	SDefBindWeightData *bwdata;
+	SDefBindPoly *bpoly;
+
+	float world[3] = {0.0f, 0.0f, 1.0f};
+	float avg_point_dist = 0.0f;
+	float tot_weight = 0.0f;
+	int inf_weight_flags = 0;
+
+	bwdata = MEM_callocN(sizeof(*bwdata), "SDefBindWeightData");
+	if (bwdata == NULL) {
+		data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+		return NULL;
+	}
+
+	bwdata->numpoly = data->vert_edges[nearest].num / 2;
+
+	bpoly = MEM_callocN(sizeof(*bpoly) * bwdata->numpoly, "SDefBindPoly");
+	if (bpoly == NULL) {
+		freeBindData(bwdata);
+		data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+		return NULL;
+	}
+
+	bwdata->bind_polys = bpoly;
+
+	/* Loop over all adjacent edges, and build the SDefBindPoly data for each poly adjacent to those */
+	for (vedge = vert_edges; vedge; vedge = vedge->next) {
+		unsigned int edge_ind = vedge->index;
+
+		for (int i = 0; i < edge_polys[edge_ind].num; i++) {
+			{
+				bpoly = bwdata->bind_polys;
+
+				for (int j = 0; j < bwdata->numpoly; bpoly++, j++) {
+					/* If coords isn't allocated, we have reached the first uninitialized bpoly */
+					if ((bpoly->index == edge_polys[edge_ind].polys[i]) || (!bpoly->coords)) {
+						break;
+					}
+				}
+			}
+
+			/* Check if poly was already created by another edge or still has to be initialized */
+			if (!bpoly->coords) {
+				float angle;
+				float axis[3];
+				float tmp_vec_v2[2];
+				int is_poly_valid;
+
+				bpoly->index = edge_polys[edge_ind].polys[i];
+				bpoly->coords = NULL;
+				bpoly->coords_v2 = NULL;
+
+				/* Copy poly data */
+				poly = &data->mpoly[bpoly->index];
+				loop = &data->mloop[poly->loopstart];
+
+				bpoly->numverts = poly->totloop;
+				bpoly->loopstart = poly->loopstart;
+
+				bpoly->coords = MEM_mallocN(sizeof(*bpoly->coords) * poly->totloop, "SDefBindPolyCoords");
+				if (bpoly->coords == NULL) {
+					freeBindData(bwdata);
+					data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+					return NULL;
+				}
+
+				bpoly->coords_v2 = MEM_mallocN(sizeof(*bpoly->coords_v2) * poly->totloop, "SDefBindPolyCoords_v2");
+				if (bpoly->coords_v2 == NULL) {
+					freeBindData(bwdata);
+					data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+					return NULL;
+				}
+
+				for (int j = 0; j < poly->totloop; j++, loop++) {
+					copy_v3_v3(bpoly->coords[j], data->targetCos[loop->v]);
+
+					/* Find corner and edge indices within poly loop array */
+					if (loop->v == nearest) {
+						bpoly->corner_ind = j;
+						bpoly->edge_vert_inds[0] = (j == 0) ? (poly->totloop - 1) : (j - 1);
+						bpoly->edge_vert_inds[1] = (j == poly->totloop - 1) ? (0) : (j + 1);
+
+						bpoly->edge_inds[0] = data->mloop[poly->loopstart + bpoly->edge_vert_inds[0]].e;
+						bpoly->edge_inds[1] = loop->e;
+					}
+				}
+
+				/* Compute poly's parametric data */
+				mid_v3_v3_array(bpoly->centroid, bpoly->coords, poly->totloop);
+				normal_poly_v3(bpoly->normal, bpoly->coords, poly->totloop);
+
+				/* Compute poly skew angle and axis */
+				angle = angle_normalized_v3v3(bpoly->normal, world);
+
+				cross_v3_v3v3(axis, bpoly->normal, world);
+				normalize_v3(axis);
+
+				/* Map coords onto 2d normal plane */
+				map_to_plane_axis_angle_v2_v3v3fl(bpoly->point_v2, point_co, axis, angle);
+
+				zero_v2(bpoly->centroid_v2);
+				for (int j = 0; j < poly->totloop; j++) {
+					map_to_plane_axis_angle_v2_v3v3fl(bpoly->coords_v2[j], bpoly->coords[j], axis, angle);
+					madd_v2_v2fl(bpoly->centroid_v2, bpoly->coords_v2[j], 1.0f / poly->totloop);
+				}
+
+				is_poly_valid = isPolyValid(bpoly->coords_v2, poly->totloop);
+
+				if (is_poly_valid != MOD_SDEF_BIND_RESULT_SUCCESS) {
+					freeBindData(bwdata);
+					data->success = is_poly_valid;
+					return NULL;
+				}
+
+				bpoly->inside = isect_point_poly_v2(bpoly->point_v2, bpoly->coords_v2, poly->totloop, false);
+
+				/* Initialize weight components */
+				bpoly->weight_angular = 1.0f;
+				bpoly->weight_dist_proj = len_v2v2(bpoly->centroid_v2, bpoly->point_v2);
+				bpoly->weight_dist = len_v3v3(bpoly->centroid, point_co);
+
+				avg_point_dist += bpoly->weight_dist;
+
+				/* Compute centroid to mid-edge vectors */
+				mid_v2_v2v2(bpoly->cent_edgemid_vecs_v2[0],
+				            bpoly->coords_v2[bpoly->edge_vert_inds[0]],
+				            bpoly->coords_v2[bpoly->corner_ind]);
+
+				mid_v2_v2v2(bpoly->cent_edgemid_vecs_v2[1],
+				            bpoly->coords_v2[bpoly->edge_vert_inds[1]],
+				            bpoly->coords_v2[bpoly->corner_ind]);
+
+				sub_v2_v2(bpoly->cent_edgemid_vecs_v2[0], bpoly->centroid_v2);
+				sub_v2_v2(bpoly->cent_edgemid_vecs_v2[1], bpoly->centroid_v2);
+
+				/* Compute poly scales with respect to mid-edges, and normalize the vectors */
+				bpoly->scales[0] = normalize_v2(bpoly->cent_edgemid_vecs_v2[0]);
+				bpoly->scales[1] = normalize_v2(bpoly->cent_edgemid_vecs_v2[1]);
+
+				/* Compute the required polygon angles */
+				bpoly->edgemid_angle = angle_normalized_v2v2(bpoly->cent_edgemid_vecs_v2[0], bpoly->cent_edgemid_vecs_v2[1]);
+
+				sub_v2_v2v2(tmp_vec_v2, bpoly->coords_v2[bpoly->corner_ind], bpoly->centroid_v2);
+				normalize_v2(tmp_vec_v2);
+
+				bpoly->corner_edgemid_angles[0] = angle_normalized_v2v2(tmp_vec_v2, bpoly->cent_edgemid_vecs_v2[0]);
+				bpoly->corner_edgemid_angles[1] = angle_normalized_v2v2(tmp_vec_v2, bpoly->cent_edgemid_vecs_v2[1]);
+
+				/* Check for infinite weights, and compute angular data otherwise */
+				if (bpoly->weight_dist < FLT_EPSILON) {
+					inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ;
+					inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST;
+				}
+				else if (bpoly->weight_dist_proj < FLT_EPSILON) {
+					inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ;
+				}
+				else {
+					float cent_point_vec[2];
+
+					sub_v2_v2v2(cent_point_vec, bpoly->point_v2, bpoly->centroid_v2);
+					normalize_v2(cent_point_vec);
+
+					bpoly->point_edgemid_angles[0] = angle_normalized_v2v2(cent_point_vec, bpoly->cent_edgemid_vecs_v2[0]);
+					bpoly->point_edgemid_angles[1] = angle_normalized_v2v2(cent_point_vec, bpoly->cent_edgemid_vecs_v2[1]);
+				}
+			}
+		}
+	}
+
+	avg_point_dist /= bwdata->numpoly;
+
+	/* If weights 1 and 2 are not infinite, loop over all adjacent edges again,
+	 * and build adjacency dependent angle data (depends on all polygons having been computed) */
+	if (!inf_weight_flags) {
+		for (vedge = vert_edges; vedge; vedge = vedge->next) {
+			SDefBindPoly *bpolys[2];
+			const SDefEdgePolys *epolys;
+			float ang_weights[2];
+			unsigned int edge_ind = vedge->index;
+			unsigned int edge_on_poly[2];
+
+			epolys = &edge_polys[edge_ind];
+
+			/* Find bind polys corresponding to the edge's adjacent polys */
+			bpoly = bwdata->bind_polys;
+
+			for (int i = 0, j = 0; (i < bwdata->numpoly) && (j < epolys->num); bpoly++, i++) {
+				if (ELEM(bpoly->index, epolys->polys[0], epolys->polys[1])) {
+					bpolys[j] = bpoly;
+
+					if (bpoly->edge_inds[0] == edge_ind) {
+						edge_on_poly[j] = 0;
+					}
+					else {
+						edge_on_poly[j] = 1;
+					}
+
+					j++;
+				}
+			}
+
+			/* Compute angular weight component */
+			if (epolys->num == 1) {
+				ang_weights[0] = computeAngularWeight(bpolys[0]->point_edgemid_angles[edge_on_poly[0]], bpolys[0]->edgemid_angle);
+				bpolys[0]->weight_angular *= ang_weights[0] * ang_weights[0];
+			}
+			else if (epolys->num == 2) {
+				ang_weights[0] = computeAngularWeight(bpolys[0]->point_edgemid_angles[edge_on_poly[0]], bpolys[0]->edgemid_angle);
+				ang_weights[1] = computeAngularWeight(bpolys[1]->point_edgemid_angles[edge_on_poly[1]], bpolys[1]->edgemid_angle);
+
+				bpolys[0]->weight_angular *= ang_weights[0] * ang_weights[1];
+				bpolys[1]->weight_angular *= ang_weights[0] * ang_weights[1];
+			}
+		}
+	}
+
+	/* Compute scalings and falloff.
+	 * Scale all weights if no infinite weight is found,
+	 * scale only unprojected weight if projected weight is infinite,
+	 * scale none if both are infinite. */
+	if (!inf_weight_flags) {
+		bpoly = bwdata->bind_polys;
+
+		for (int i = 0; i < bwdata->numpoly; bpoly++, i++) {
+			float corner_angle_weights[2];
+			float scale_weight, sqr, inv_sqr;
+
+			corner_angle_weights[0] = bpoly->point_edgemid_angles[0] / bpoly->corner_edgemid_angles[0];
+			corner_angle_weights[1] = bpoly->point_edgemid_angles[1] / bpoly->corner_edgemid_angles[1];
+
+			if (isnan(corner_angle_weights[0]) || isnan(corner_angle_weights[1])) {
+				freeBindData(bwdata);
+				data->success = MOD_SDEF_BIND_RESULT_GENERIC_ERR;
+				return NULL;
+			}
+
+			/* Find which edge the point is closer to */
+			if (corner_angle_weights[0] < corner_angle_weights[1]) {
+				bpoly->dominant_edge = 0;
+				bpoly->dominant_angle_weight = corner_angle_weights[0];
+			}
+			else {
+				bpoly->dominant_edge = 1;
+				bpoly->dominant_angle_weight = corner_angle_weights[1];
+			}
+
+			bpoly->dominant_angle_weight = sinf(bpoly->dominant_angle_weight * M_PI_2);
+
+			/* Compute quadratic angular scale interpolation weight */
+			scale_weight = bpoly->point_edgemid_angles[bpoly->dominant_edge] / bpoly->edgemid_angle;
+			scale_weight /= scale_weight + (bpoly->point_edgemid_angles[!bpoly->dominant_edge] / bpoly->edgemid_angle);
+
+			sqr = scale_weight * scale_weight;
+			inv_sqr = 1.0f - scale_weight;
+			inv_sqr *= inv_sqr;
+			scale_weight = sqr / (sqr + inv_sqr);
+
+			/* Compute interpolated scale (no longer need the individual scales,
+			 * so simply storing the result over the scale in index zero) */
+			bpoly->scales[0] = bpoly->scales[bpoly->dominant_edge] * (1.0f - scale_weight) +
+			                   bpoly->scales[!bpoly->dominant_edge] * scale_weight;
+
+			/* Scale the point distance weights, and introduce falloff */
+			bpoly->weight_dist_proj /= bpoly->scales[0];
+			bpoly->weight_dist_proj = powf(bpoly->weight_dist_proj, data->falloff);
+
+			bpoly->weight_dist /= avg_point_dist;
+			bpoly->weight_dist = powf(bpoly->weight_dist, data->falloff);
+
+			/* Re-check for infinite weights, now that all scalings and interpolations are computed */
+			if (bpoly->weight_dist < FLT_EPSILON) {
+				inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ;
+				inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST;
+			}
+			else if (bpoly->weight_dist_proj < FLT_EPSILON) {
+				inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ;
+			}
+			else if (bpoly->weight_angular < FLT_EPSILON) {
+				inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_ANGULAR;
+			}
+		}
+	}
+	else if (!(inf_weight_flags & MOD_SDEF_INFINITE_WEIGHT_DIST)) {
+		bpoly = bwdata->bind_polys;
+
+		for (int i = 0; i < bwdata->numpoly; bpoly++, i++) {
+			/* Scale the point distance weight by average point distance, and introduce falloff */
+			bpoly->weight_dist /= avg_point_dist;
+			bpoly->weight_dist = powf(bpoly->weight_dist, data->falloff);
+
+			/* Re-check for infinite weights, now that all scalings and interpolations are computed */
+			if (bpoly->weight_dist < FLT_EPSILON) {
+				inf_weight_flags |= MOD_SDEF_INFINITE_WEIGHT_DIST;
+			}
+		}
+	}
+
+	/* Final loop, to compute actual weights */
+	bpoly = bwdata->bind_polys;
+
+	for (int i = 0; i < bwdata->numpoly; bpoly++, i++) {
+		/* Weight computation from components */
+		if (inf_weight_flags & MOD_SDEF_INFINITE_WEIGHT_DIST) {
+			bpoly->weight = bpoly->weight_dist < FLT_EPSILON ? 1.0f : 0.0f;
+		}
+		else if (inf_weight_flags & MOD_SDEF_INFINITE_WEIGHT_DIST_PROJ) {
+			bpoly->weight = bpoly->weight_dist_proj < FLT_EPSILON ?
+			                1.0f / bpoly->weight_dist : 0.0f;
+		}
+		else if (inf_weight_flags & MOD_SDEF_INFINITE_WEIGHT_ANGULAR) {
+			bpoly->weight = bpoly->weight_angular < FLT_EPSILON ?
+			                1.0f / bpoly->weight_dist_proj / bpoly->weight_dist : 0.0f;
+		}
+		else {
+			bpoly->weight = 1.0f / bpoly->weight_angular /
+			                       bpoly->weight_dist_proj /
+			                       bpoly->weight_dist;
+		}
+
+		tot_weight += bpoly->weight;
+	}
+
+	bpoly = bwdata->bind_polys;
+
+	for (int i = 0; i < bwdata->numpoly; bpoly++, i++) {
+		bpoly->weight /= tot_weight;
+
+		/* Evaluate if this poly is relevant to bind */
+		/* Even though the weights should add up to 1.0,
+		 * the losses of weights smaller than epsilon here
+		 * should be negligible... */
+		if (bpoly->weight >= FLT_EPSILON) {
+			if (bpoly->inside) {
+				bwdata->numbinds += 1;
+			}
+			else {
+				if (bpoly->dominant_angle_weight < FLT_EPSILON || 1.0f - bpoly->dominant_angle_weight < FLT_EPSILON) {
+					bwdata->numbinds += 1;
+				}
+				else {
+					bwdata->numbinds += 2;
+				}
+			}
+		}
+	}
+
+	return bwdata;
+}
+
+BLI_INLINE float computeNormalDisplacement(const float point_co[3], const float point_co_proj[3], const float normal[3])
+{
+	float disp_vec[3];
+	float normal_dist;
+
+	sub_v3_v3v3(disp_vec, point_co, point_co_proj);
+	normal_dist = len_v3(disp_vec);
+
+	if (dot_v3v3(disp_vec, normal) < 0) {
+		normal_dist *= -1;
+	}
+
+	return normal_dist;
+}
+
+static void bindVert(void *userdata, void *UNUSED(userdata_chunk), const int index, const int UNUSED(threadid))
+{
+	SDefBindCalcData * const data = (SDefBindCalcData *)userdata;
+	float point_co[3];
+	float point_co_proj[3];
+
+	SDefBindWeightData *bwdata;
+	SDefVert *sdvert = data->bind_verts + index;
+	SDefBindPoly *bpoly;
+	SDefBind *sdbind;
+
+	if (data->success != MOD_SDEF_BIND_RESULT_SUCCESS) {
+		sdvert->binds = NULL;
+		sdvert->numbinds = 0;
+		return;
+	}
+
+	copy_v3_v3(point_co, data->vertexCos[index]);
+	bwdata = computeBindWeights(data, point_co);
+
+	if (bwdata == NULL) {
+		sdvert->binds = NULL;
+		sdvert->numbinds = 0;
+		return;
+	}
+
+	sdvert->binds = MEM_callocN(sizeof(*sdvert->binds) * bwdata->numbinds, "SDefVertBindData");
+	if (sdvert->binds == NULL) {
+		data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+		sdvert->numbinds = 0;
+		return;
+	}
+
+	sdvert->numbinds = bwdata->numbinds;
+
+	sdbind = sdvert->binds;
+
+	bpoly = bwdata->bind_polys;
+
+	for (int i = 0; i < bwdata->numbinds; bpoly++) {
+		if (bpoly->weight >= FLT_EPSILON) {
+			if (bpoly->inside) {
+				const MLoop *loop = &data->mloop[bpoly->loopstart];
+
+				sdbind->influence = bpoly->weight;
+				sdbind->numverts = bpoly->numverts;
+
+				sdbind->mode = MOD_SDEF_MODE_NGON;
+				sdbind->vert_weights = MEM_mallocN(sizeof(*sdbind->vert_weights) * bpoly->numverts, "SDefNgonVertWeights");
+				if (sdbind->vert_weights == NULL) {
+					data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+					return;
+				}
+
+				sdbind->vert_inds = MEM_mallocN(sizeof(*sdbind->vert_inds) * bpoly->numverts, "SDefNgonVertInds");
+				if (sdbind->vert_inds == NULL) {
+					data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+					return;
+				}
+
+				interp_weights_poly_v2(sdbind->vert_weights, bpoly->coords_v2, bpoly->numverts, bpoly->point_v2);
+
+				/* Reproject vert based on weights and original poly verts, to reintroduce poly non-planarity */
+				zero_v3(point_co_proj);
+				for (int j = 0; j < bpoly->numverts; j++, loop++) {
+					madd_v3_v3fl(point_co_proj, bpoly->coords[j], sdbind->vert_weights[j]);
+					sdbind->vert_inds[j] = loop->v;
+				}
+
+				sdbind->normal_dist = computeNormalDisplacement(point_co, point_co_proj, bpoly->normal);
+
+				sdbind++;
+				i++;
+			}
+			else {
+				float tmp_vec[3];
+				float cent[3], norm[3];
+				float v1[3], v2[3], v3[3];
+
+				if (1.0f - bpoly->dominant_angle_weight >= FLT_EPSILON) {
+					sdbind->influence = bpoly->weight * (1.0f - bpoly->dominant_angle_weight);
+					sdbind->numverts = bpoly->numverts;
+
+					sdbind->mode = MOD_SDEF_MODE_CENTROID;
+					sdbind->vert_weights = MEM_mallocN(sizeof(*sdbind->vert_weights) * 3, "SDefCentVertWeights");
+					if (sdbind->vert_weights == NULL) {
+						data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+						return;
+					}
+
+					sdbind->vert_inds = MEM_mallocN(sizeof(*sdbind->vert_inds) * bpoly->numverts, "SDefCentVertInds");
+					if (sdbind->vert_inds == NULL) {
+						data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+						return;
+					}
+
+					sortPolyVertsEdge(sdbind->vert_inds, &data->mloop[bpoly->loopstart],
+					                  bpoly->edge_inds[bpoly->dominant_edge], bpoly->numverts);
+
+					copy_v3_v3(v1, data->targetCos[sdbind->vert_inds[0]]);
+					copy_v3_v3(v2, data->targetCos[sdbind->vert_inds[1]]);
+					copy_v3_v3(v3, bpoly->centroid);
+
+					mid_v3_v3v3v3(cent, v1, v2, v3);
+					normal_tri_v3(norm, v1, v2, v3);
+
+					add_v3_v3v3(tmp_vec, point_co, bpoly->normal);
+
+					/* We are sure the line is not parallel to the plane.
+					 * Checking return value just to avoid warning... */
+					if (!isect_line_plane_v3(point_co_proj, point_co, tmp_vec, cent, norm)) {
+						BLI_assert(false);
+					}
+
+					interp_weights_tri_v3(sdbind->vert_weights, v1, v2, v3, point_co_proj);
+
+					sdbind->normal_dist = computeNormalDisplacement(point_co, point_co_proj, bpoly->normal);
+
+					sdbind++;
+					i++;
+				}
+
+				if (bpoly->dominant_angle_weight >= FLT_EPSILON) {
+					sdbind->influence = bpoly->weight * bpoly->dominant_angle_weight;
+					sdbind->numverts = bpoly->numverts;
+
+					sdbind->mode = MOD_SDEF_MODE_LOOPTRI;
+					sdbind->vert_weights = MEM_mallocN(sizeof(*sdbind->vert_weights) * 3, "SDefTriVertWeights");
+					if (sdbind->vert_weights == NULL) {
+						data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+						return;
+					}
+
+					sdbind->vert_inds = MEM_mallocN(sizeof(*sdbind->vert_inds) * bpoly->numverts, "SDefTriVertInds");
+					if (sdbind->vert_inds == NULL) {
+						data->success = MOD_SDEF_BIND_RESULT_MEM_ERR;
+						return;
+					}
+
+					sortPolyVertsTri(sdbind->vert_inds, &data->mloop[bpoly->loopstart], bpoly->edge_vert_inds[0], bpoly->numverts);
+
+					copy_v3_v3(v1, data->targetCos[sdbind->vert_inds[0]]);
+					copy_v3_v3(v2, data->targetCos[sdbind->vert_inds[1]]);
+					copy_v3_v3(v3, data->targetCos[sdbind->vert_inds[2]]);
+
+					mid_v3_v3v3v3(cent, v1, v2, v3);
+					normal_tri_v3(norm, v1, v2, v3);
+
+					add_v3_v3v3(tmp_vec, point_co, bpoly->normal);
+
+					/* We are sure the line is not parallel to the plane.
+					 * Checking return value just to avoid warning... */
+					if (!isect_line_plane_v3(point_co_proj, point_co, tmp_vec, cent, norm)) {
+						BLI_assert(false);
+					}
+
+					interp_weights_tri_v3(sdbind->vert_weights, v1, v2, v3, point_co_proj);
+
+					sdbind->normal_dist = computeNormalDisplacement(point_co, point_co_proj, bpoly->normal);
+
+					sdbind++;
+					i++;
+				}
+			}
+		}
+	}
+
+	freeBindData(bwdata);
+}
+
+/**
+ * Bind the modifier's vertices to the target mesh \a tdm.
+ *
+ * Allocates the target adjacency maps, the per-vertex bind array (`smd->verts`) and a
+ * looptri BVH tree, transforms the target vertices by `smd->mat` and then runs
+ * `bindVert` over the index range [0, numverts).
+ * All temporary data is released before returning. On failure an error message is set
+ * on the modifier and the bind array is released again.
+ *
+ * \param vertexCos: Vertex coordinates handed to the bind callback.
+ * \param numverts: Number of vertices to bind (element count of `smd->verts`).
+ * \param tnumpoly: Polygon count of the target, stored in `smd->numpoly`.
+ * \param tnumverts: Vertex count of the target.
+ * \return true when binding succeeded.
+ */
+static bool surfacedeformBind(SurfaceDeformModifierData *smd, float (*vertexCos)[3],
+                              unsigned int numverts, unsigned int tnumpoly, unsigned int tnumverts, DerivedMesh *tdm)
+{
+	BVHTreeFromMesh treeData = {NULL};
+	const MVert *mvert = tdm->getVertArray(tdm);
+	const MPoly *mpoly = tdm->getPolyArray(tdm);
+	const MEdge *medge = tdm->getEdgeArray(tdm);
+	const MLoop *mloop = tdm->getLoopArray(tdm);
+	unsigned int tnumedges = tdm->getNumEdges(tdm);
+	int adj_result;
+	SDefAdjacencyArray *vert_edges;
+	SDefAdjacency *adj_array;
+	SDefEdgePolys *edge_polys;
+
+	vert_edges = MEM_callocN(sizeof(*vert_edges) * tnumverts, "SDefVertEdgeMap");
+	if (vert_edges == NULL) {
+		modifier_setError((ModifierData *)smd, "Out of memory");
+		return false;
+	}
+
+	adj_array = MEM_mallocN(sizeof(*adj_array) * tnumedges * 2, "SDefVertEdge");
+	if (adj_array == NULL) {
+		modifier_setError((ModifierData *)smd, "Out of memory");
+		MEM_freeN(vert_edges);
+		return false;
+	}
+
+	edge_polys = MEM_callocN(sizeof(*edge_polys) * tnumedges, "SDefEdgeFaceMap");
+	if (edge_polys == NULL) {
+		modifier_setError((ModifierData *)smd, "Out of memory");
+		MEM_freeN(vert_edges);
+		MEM_freeN(adj_array);
+		return false;
+	}
+
+	smd->verts = MEM_mallocN(sizeof(*smd->verts) * numverts, "SDefBindVerts");
+	if (smd->verts == NULL) {
+		modifier_setError((ModifierData *)smd, "Out of memory");
+		freeAdjacencyMap(vert_edges, adj_array, edge_polys);
+		return false;
+	}
+
+	bvhtree_from_mesh_looptri(&treeData, tdm, 0.0, 2, 6);
+	if (treeData.tree == NULL) {
+		modifier_setError((ModifierData *)smd, "Out of memory");
+		freeAdjacencyMap(vert_edges, adj_array, edge_polys);
+		MEM_freeN(smd->verts);
+		smd->verts = NULL;
+		return false;
+	}
+
+	adj_result = buildAdjacencyMap(mpoly, medge, mloop, tnumpoly, tnumedges, vert_edges, adj_array, edge_polys);
+
+	if (adj_result == MOD_SDEF_BIND_RESULT_NONMANY_ERR) {
+		modifier_setError((ModifierData *)smd, "Target has edges with more than two polygons");
+		freeAdjacencyMap(vert_edges, adj_array, edge_polys);
+		free_bvhtree_from_mesh(&treeData);
+		MEM_freeN(smd->verts);
+		smd->verts = NULL;
+		return false;
+	}
+
+	smd->numverts = numverts;
+	smd->numpoly = tnumpoly;
+
+	SDefBindCalcData data = {.treeData = &treeData,
+		                     .vert_edges = vert_edges,
+		                     .edge_polys = edge_polys,
+		                     .mpoly = mpoly,
+		                     .medge = medge,
+		                     .mloop = mloop,
+		                     .looptri = tdm->getLoopTriArray(tdm),
+		                     .targetCos = MEM_mallocN(sizeof(float[3]) * tnumverts, "SDefTargetBindVertArray"),
+		                     .bind_verts = smd->verts,
+		                     .vertexCos = vertexCos,
+		                     .falloff = smd->falloff,
+		                     .success = MOD_SDEF_BIND_RESULT_SUCCESS};
+
+	if (data.targetCos == NULL) {
+		modifier_setError((ModifierData *)smd, "Out of memory");
+		/* bindVert has not run yet, so the entries of smd->verts are still uninitialized:
+		 * release the raw array the same way the failure paths above do, and do not
+		 * leak the adjacency maps or the BVH tree. */
+		freeAdjacencyMap(vert_edges, adj_array, edge_polys);
+		free_bvhtree_from_mesh(&treeData);
+		MEM_freeN(smd->verts);
+		smd->verts = NULL;
+		return false;
+	}
+
+	invert_m4_m4(data.imat, smd->mat);
+
+	/* Target vertex coordinates transformed by smd->mat. */
+	for (unsigned int i = 0; i < tnumverts; i++) {
+		mul_v3_m4v3(data.targetCos[i], smd->mat, mvert[i].co);
+	}
+
+	BLI_task_parallel_range_ex(0, numverts, &data, NULL, 0, bindVert,
+	                           numverts > 10000, false);
+
+	MEM_freeN(data.targetCos);
+
+	/* The bind callback reports failures through data.success. */
+	if (data.success == MOD_SDEF_BIND_RESULT_MEM_ERR) {
+		modifier_setError((ModifierData *)smd, "Out of memory");
+		freeData((ModifierData *)smd);
+	}
+	else if (data.success == MOD_SDEF_BIND_RESULT_NONMANY_ERR) {
+		modifier_setError((ModifierData *)smd, "Target has edges with more than two polygons");
+		freeData((ModifierData *)smd);
+	}
+	else if (data.success == MOD_SDEF_BIND_RESULT_CONCAVE_ERR) {
+		modifier_setError((ModifierData *)smd, "Target contains concave polygons");
+		freeData((ModifierData *)smd);
+	}
+	else if (data.success == MOD_SDEF_BIND_RESULT_OVERLAP_ERR) {
+		modifier_setError((ModifierData *)smd, "Target contains overlapping verts");
+		freeData((ModifierData *)smd);
+	}
+	else if (data.success == MOD_SDEF_BIND_RESULT_GENERIC_ERR) {
+		/* I know this message is vague, but I could not think of a way
+		 * to explain this with a reasonably sized message.
+		 * Though it shouldn't really matter all that much,
+		 * because this is very unlikely to occur */
+		modifier_setError((ModifierData *)smd, "Target contains invalid polygons");
+		freeData((ModifierData *)smd);
+	}
+
+	freeAdjacencyMap(vert_edges, adj_array, edge_polys);
+	free_bvhtree_from_mesh(&treeData);
+
+	return data.success == MOD_SDEF_BIND_RESULT_SUCCESS;
+}
+
+/**
+ * Task callback: recompute the coordinate of vertex \a index from its binds.
+ *
+ * Every bind contributes a point built from the transformed target coordinates
+ * (how depends on the bind mode), offset along the bound polygon's normal by
+ * `normal_dist` and accumulated into the vertex scaled by the bind's `influence`.
+ */
+static void deformVert(void *userdata, void *UNUSED(userdata_chunk), const int index, const int UNUSED(threadid))
+{
+	const SDefDeformData * const data = (SDefDeformData *)userdata;
+	const SDefBind * const binds = data->bind_verts[index].binds;
+	float * const co = data->vertexCos[index];
+	float poly_no[3], point[3];
+
+	zero_v3(co);
+
+	for (int b = 0; b < data->bind_verts[index].numbinds; b++) {
+		const SDefBind *bind = &binds[b];
+
+		/* Gather the current coordinates of the bound polygon (needed by every mode for the normal) */
+		float (*poly_cos)[3] = MEM_mallocN(sizeof(*poly_cos) * bind->numverts, "SDefDoPolyCoords");
+
+		for (int v = 0; v < bind->numverts; v++) {
+			copy_v3_v3(poly_cos[v], data->targetCos[bind->vert_inds[v]]);
+		}
+
+		normal_poly_v3(poly_no, poly_cos, bind->numverts);
+		zero_v3(point);
+
+		switch (bind->mode) {
+			case MOD_SDEF_MODE_LOOPTRI:
+				/* Weighted sum of the first three bound verts */
+				for (int v = 0; v < 3; v++) {
+					madd_v3_v3fl(point, data->targetCos[bind->vert_inds[v]], bind->vert_weights[v]);
+				}
+				break;
+			case MOD_SDEF_MODE_NGON:
+				/* Weighted sum of all polygon verts */
+				for (int v = 0; v < bind->numverts; v++) {
+					madd_v3_v3fl(point, poly_cos[v], bind->vert_weights[v]);
+				}
+				break;
+			case MOD_SDEF_MODE_CENTROID:
+			{
+				/* Weighted sum of the first two bound verts and the polygon centroid */
+				float centroid[3];
+				mid_v3_v3_array(centroid, poly_cos, bind->numverts);
+
+				madd_v3_v3fl(point, data->targetCos[bind->vert_inds[0]], bind->vert_weights[0]);
+				madd_v3_v3fl(point, data->targetCos[bind->vert_inds[1]], bind->vert_weights[1]);
+				madd_v3_v3fl(point, centroid, bind->vert_weights[2]);
+				break;
+			}
+		}
+
+		MEM_freeN(poly_cos);
+
+		/* Offset along the polygon normal, identical for all modes */
+		madd_v3_v3fl(point, poly_no, bind->normal_dist);
+
+		madd_v3_v3fl(co, point, bind->influence);
+	}
+}
+
+/**
+ * Shared implementation of the object-mode and edit-mode deform callbacks.
+ *
+ * When the bind flag is not set, any bind data is freed and nothing is deformed.
+ * Otherwise the modifier is bound to the target first if it has no bind data yet,
+ * and \a vertexCos is then overwritten by running `deformVert` for every vertex.
+ *
+ * \param vertexCos: Coordinates of the deformed object's vertices, modified in place.
+ * \param numverts: Number of entries in \a vertexCos.
+ * \param ob: Object owning the modifier; its matrix is used to compute `smd->mat` when binding.
+ */
+static void surfacedeformModifier_do(ModifierData *md, float (*vertexCos)[3], unsigned int numverts, Object *ob)
+{
+	SurfaceDeformModifierData *smd = (SurfaceDeformModifierData *)md;
+	DerivedMesh *tdm;
+	unsigned int tnumverts, tnumpoly;
+
+	/* Exit function if bind flag is not set (free bind data if any) */
+	if (!(smd->flags & MOD_SDEF_BIND)) {
+		freeData(md);
+		return;
+	}
+
+	/* Handle target mesh both in and out of edit mode */
+	if (smd->target == md->scene->obedit) {
+		BMEditMesh *em = BKE_editmesh_from_object(smd->target);
+		tdm = em->derivedFinal;
+	}
+	else {
+		tdm = smd->target->derivedFinal;
+	}
+
+	/* The target may not have an evaluated mesh, avoid dereferencing NULL below */
+	if (tdm == NULL) {
+		modifier_setError(md, "No valid target mesh");
+		return;
+	}
+
+	tnumverts = tdm->getNumVerts(tdm);
+	tnumpoly = tdm->getNumPolys(tdm);
+
+	/* If not bound, execute bind */
+	if (!(smd->verts)) {
+		float tmp_mat[4][4];
+
+		invert_m4_m4(tmp_mat, ob->obmat);
+		mul_m4_m4m4(smd->mat, tmp_mat, smd->target->obmat);
+
+		if (!surfacedeformBind(smd, vertexCos, numverts, tnumpoly, tnumverts, tdm)) {
+			/* Clear the flag so a failed bind is not retried on every evaluation */
+			smd->flags &= ~MOD_SDEF_BIND;
+			tdm->release(tdm);
+			return;
+		}
+	}
+
+	/* Vertex and polygon count checks */
+	if (smd->numverts != numverts) {
+		modifier_setError(md, "Verts changed from %u to %u", smd->numverts, numverts);
+		tdm->release(tdm);
+		return;
+	}
+	else if (smd->numpoly != tnumpoly) {
+		modifier_setError(md, "Target polygons changed from %u to %u", smd->numpoly, tnumpoly);
+		tdm->release(tdm);
+		return;
+	}
+
+	/* Actual vertex location update starts here */
+	SDefDeformData data = {.bind_verts = smd->verts,
+		                   .targetCos = MEM_mallocN(sizeof(float[3]) * tnumverts, "SDefTargetVertArray"),
+		                   .vertexCos = vertexCos};
+
+	if (data.targetCos != NULL) {
+		bool tdm_vert_alloc;
+		const MVert * const mvert = DM_get_vert_array(tdm, &tdm_vert_alloc);
+
+		/* Target vertex coordinates transformed by smd->mat */
+		for (unsigned int i = 0; i < tnumverts; i++) {
+			mul_v3_m4v3(data.targetCos[i], smd->mat, mvert[i].co);
+		}
+
+		BLI_task_parallel_range_ex(0, numverts, &data, NULL, 0, deformVert,
+		                           numverts > 10000, false);
+
+		if (tdm_vert_alloc) {
+			MEM_freeN((void *)mvert);
+		}
+
+		MEM_freeN(data.targetCos);
+	}
+	else {
+		/* Leave the vertices untouched, but tell the user why nothing was deformed */
+		modifier_setError(md, "Out of memory");
+	}
+
+	tdm->release(tdm);
+}
+
+/* Object-mode deform callback: forwards to surfacedeformModifier_do(),
+ * the derived mesh and apply flag arguments are unused. */
+static void deformVerts(ModifierData *md, Object *ob,
+                        DerivedMesh *UNUSED(derivedData),
+                        float (*vertexCos)[3], int numVerts,
+                        ModifierApplyFlag UNUSED(flag))
+{
+	surfacedeformModifier_do(md, vertexCos, numVerts, ob);
+}
+
+/* Edit-mode deform callback: forwards to surfacedeformModifier_do(),
+ * the edit mesh and derived mesh arguments are unused. */
+static void deformVertsEM(ModifierData *md, Object *ob,
+                          struct BMEditMesh *UNUSED(editData),
+                          DerivedMesh *UNUSED(derivedData),
+                          float (*vertexCos)[3], int numVerts)
+{
+	surfacedeformModifier_do(md, vertexCos, numVerts, ob);
+}
+
+/* The modifier is disabled for as long as no target object is assigned. */
+static bool isDisabled(ModifierData *md, int UNUSED(useRenderParams))
+{
+	const SurfaceDeformModifierData *sdef_md = (SurfaceDeformModifierData *)md;
+
+	if (sdef_md->target) {
+		return false;
+	}
+
+	return true;
+}
+
+/* Type description of the Surface Deform modifier: a deform-only modifier that
+ * accepts meshes and supports edit mode. Entries are positional, the comment in
+ * front of each value names the ModifierTypeInfo member it fills;
+ * NULL marks a callback this modifier does not provide. */
+ModifierTypeInfo modifierType_SurfaceDeform = {
+	/* name */              "Surface Deform",
+	/* structName */        "SurfaceDeformModifierData",
+	/* structSize */        sizeof(SurfaceDeformModifierData),
+	/* type */              eModifierTypeType_OnlyDeform,
+	/* flags */             eModifierTypeFlag_AcceptsMesh |
+	                        eModifierTypeFlag_SupportsEditmode,
+
+	/* copyData */          copyData,
+	/* deformVerts */       deformVerts,
+	/* deformMatrices */    NULL,
+	/* deformVertsEM */     deformVertsEM,
+	/* deformMatricesEM */  NULL,
+	/* applyModifier */     NULL,
+	/* applyModifierEM */   NULL,
+	/* initData */          initData,
+	/* requiredDataMask */  NULL,
+	/* freeData */          freeData,
+	/* isDisabled */        isDisabled,
+	/* updateDepgraph */    updateDepgraph,
+	/* updateDepsgraph */   updateDepsgraph,
+	/* dependsOnTime */     NULL,
+	/* dependsOnNormals */  NULL,
+	/* foreachObjectLink */ foreachObjectLink,
+	/* foreachIDLink */     NULL,
+	/* foreachTexLink */    NULL,
+};
diff --git a/source/blender/modifiers/intern/MOD_util.c b/source/blender/modifiers/intern/MOD_util.c
index 93414562ccf..ded1f0b77e6 100644
--- a/source/blender/modifiers/intern/MOD_util.c
+++ b/source/blender/modifiers/intern/MOD_util.c
@@ -287,5 +287,6 @@ void modifier_type_init(ModifierTypeInfo *types[])
 	INIT_TYPE(NormalEdit);
 	INIT_TYPE(CorrectiveSmooth);
 	INIT_TYPE(MeshSequenceCache);
+	INIT_TYPE(SurfaceDeform);
 #undef INIT_TYPE
 }
diff --git a/source/blender/nodes/composite/nodes/node_composite_glare.c b/source/blender/nodes/composite/nodes/node_composite_glare.c
index c512ea49586..76020e55463 100644
--- a/source/blender/nodes/composite/nodes/node_composite_glare.c
+++ b/source/blender/nodes/composite/nodes/node_composite_glare.c
@@ -50,7 +50,8 @@ static void node_composit_init_glare(bNodeTree *UNUSED(ntree), bNode *node)
 	ndg->colmod = 0.25;
 	ndg->mix = 0;
 	ndg->threshold = 1;
-	ndg->angle = 4;
+	ndg->star_45 = true;
+	ndg->streaks = 4;
 	ndg->angle_ofs = 0.0f;
 	ndg->fade = 0.9;
 	ndg->size = 8;
diff --git a/source/blender/nodes/shader/nodes/node_shader_fresnel.c b/source/blender/nodes/shader/nodes/node_shader_fresnel.c
index d5e11795fc0..5a9e33a4053 100644
--- a/source/blender/nodes/shader/nodes/node_shader_fresnel.c
+++ b/source/blender/nodes/shader/nodes/node_shader_fresnel.c
@@ -64,10 +64,11 @@ static void node_shader_exec_fresnel(void *data, int UNUSED(thread), bNode *UNUS
 		copy_v3_v3(n, shi->vn);
 	}
 
-	if(shi->use_world_space_shading)
+	if (shi->use_world_space_shading) {
 		mul_mat3_m4_v3((float (*)[4])RE_render_current_get_matrix(RE_VIEW_MATRIX), n);
+	}
 
-	out[0]->vec[0] = RE_fresnel_dielectric(shi->view, n, shi->flippednor ? 1/eta : eta);
+	out[0]->vec[0] = RE_fresnel_dielectric(shi->view, n, shi->flippednor ? 1 / eta : eta);
 }
 
 /* node type definition */
diff --git a/source/blender/nodes/shader/nodes/node_shader_layer_weight.c b/source/blender/nodes/shader/nodes/node_shader_layer_weight.c
index 90e2625b961..a0b2408a7bb 100644
--- a/source/blender/nodes/shader/nodes/node_shader_layer_weight.c
+++ b/source/blender/nodes/shader/nodes/node_shader_layer_weight.c
@@ -69,7 +69,7 @@ static void node_shader_exec_layer_weight(void *data, int UNUSED(thread), bNode
 	if (shi->use_world_space_shading)
 		mul_mat3_m4_v3((float (*)[4])RE_render_current_get_matrix(RE_VIEW_MATRIX), n);
 
-	out[0]->vec[0] = RE_fresnel_dielectric(shi->view, n, shi->flippednor ? eta : 1/eta);
+	out[0]->vec[0] = RE_fresnel_dielectric(shi->view, n, shi->flippednor ? eta : 1 / eta);
 
 	float facing = fabs(dot_v3v3(shi->view, n));
 	if (blend != 0.5) {
diff --git a/source/blender/nodes/shader/nodes/node_shader_normal_map.c b/source/blender/nodes/shader/nodes/node_shader_normal_map.c
index 48d1688c386..e0bf34f42e4 100644
--- a/source/blender/nodes/shader/nodes/node_shader_normal_map.c
+++ b/source/blender/nodes/shader/nodes/node_shader_normal_map.c
@@ -63,7 +63,7 @@ static void node_shader_exec_normal_map(void *data, int UNUSED(thread), bNode *n
 
 		CLAMP_MIN(strength, 0.0f);
 
-		float *N = shi->vno;
+		float *N = shi->nmapnorm;
 		int uv_index = 0;
 		switch (nm->space) {
 			case SHD_NORMAL_MAP_TANGENT:
diff --git a/source/blender/nodes/shader/nodes/node_shader_tex_brick.c b/source/blender/nodes/shader/nodes/node_shader_tex_brick.c
index 0be47c4f751..1dfebc45d60 100644
--- a/source/blender/nodes/shader/nodes/node_shader_tex_brick.c
+++ b/source/blender/nodes/shader/nodes/node_shader_tex_brick.c
@@ -64,7 +64,7 @@ static void node_shader_init_tex_brick(bNodeTree *UNUSED(ntree), bNode *node)
 
 	for (bNodeSocket *sock = node->inputs.first; sock; sock = sock->next) {
 		if (STREQ(sock->name, "Mortar Smooth")) {
-			((bNodeSocketValueFloat*)sock->default_value)->value = 0.1f;
+			((bNodeSocketValueFloat *)sock->default_value)->value = 0.1f;
 		}
 	}
 }
diff --git a/source/blender/physics/intern/BPH_mass_spring.cpp b/source/blender/physics/intern/BPH_mass_spring.cpp
index 359395b63c4..b694b6e994d 100644
--- a/source/blender/physics/intern/BPH_mass_spring.cpp
+++ b/source/blender/physics/intern/BPH_mass_spring.cpp
@@ -333,19 +333,14 @@ static int UNUSED_FUNCTION(cloth_calc_helper_forces)(Object *UNUSED(ob), ClothMo
 	return 1;
 }
 
-BLI_INLINE void cloth_calc_spring_force(ClothModifierData *clmd, ClothSpring *s, float time)
+BLI_INLINE void cloth_calc_spring_force(ClothModifierData *clmd, ClothSpring *s)
 {
 	Cloth *cloth = clmd->clothObject;
 	ClothSimSettings *parms = clmd->sim_parms;
 	Implicit_Data *data = cloth->implicit;
-	ClothVertex *verts = cloth->verts;
 	
 	bool no_compress = parms->flags & CLOTH_SIMSETTINGS_FLAG_NO_SPRING_COMPRESS;
 	
-	zero_v3(s->f);
-	zero_m3(s->dfdx);
-	zero_m3(s->dfdv);
-	
 	s->flags &= ~CLOTH_SPRING_FLAG_NEEDED;
 	
 	// calculate force of structural + shear springs
@@ -361,31 +356,13 @@ BLI_INLINE void cloth_calc_spring_force(ClothModifierData *clmd, ClothSpring *s,
 		if (s->type & CLOTH_SPRING_TYPE_SEWING) {
 			// TODO: verify, half verified (couldn't see error)
 			// sewing springs usually have a large distance at first so clamp the force so we don't get tunnelling through colission objects
-			BPH_mass_spring_force_spring_linear(data, s->ij, s->kl, s->restlen, k, parms->Cdis, no_compress, parms->max_sewing, s->f, s->dfdx, s->dfdv);
+			BPH_mass_spring_force_spring_linear(data, s->ij, s->kl, s->restlen, k, parms->Cdis, no_compress, parms->max_sewing);
 		}
 		else {
-			BPH_mass_spring_force_spring_linear(data, s->ij, s->kl, s->restlen, k, parms->Cdis, no_compress, 0.0f, s->f, s->dfdx, s->dfdv);
+			BPH_mass_spring_force_spring_linear(data, s->ij, s->kl, s->restlen, k, parms->Cdis, no_compress, 0.0f);
 		}
 #endif
 	}
-	else if (s->type & CLOTH_SPRING_TYPE_GOAL) {
-#ifdef CLOTH_FORCE_SPRING_GOAL
-		float goal_x[3], goal_v[3];
-		float k, scaling;
-		
-		s->flags |= CLOTH_SPRING_FLAG_NEEDED;
-		
-		// current_position = xold + t * (newposition - xold)
-		/* divide by time_scale to prevent goal vertices' delta locations from being multiplied */
-		interp_v3_v3v3(goal_x, verts[s->ij].xold, verts[s->ij].xconst, time / parms->time_scale);
-		sub_v3_v3v3(goal_v, verts[s->ij].xconst, verts[s->ij].xold); // distance covered over dt==1
-		
-		scaling = parms->goalspring + s->stiffness * fabsf(parms->max_struct - parms->goalspring);
-		k = verts[s->ij].goal * scaling / (parms->avg_spring_len + FLT_EPSILON);
-		
-		BPH_mass_spring_force_spring_goal(data, s->ij, goal_x, goal_v, k, parms->goalfrict * 0.01f, s->f, s->dfdx, s->dfdv);
-#endif
-	}
 	else if (s->type & CLOTH_SPRING_TYPE_BENDING) {  /* calculate force of bending springs */
 #ifdef CLOTH_FORCE_SPRING_BEND
 		float kb, cb, scaling;
@@ -398,7 +375,7 @@ BLI_INLINE void cloth_calc_spring_force(ClothModifierData *clmd, ClothSpring *s,
 		// Fix for [#45084] for cloth stiffness must have cb proportional to kb
 		cb = kb * parms->bending_damping;
 		
-		BPH_mass_spring_force_spring_bending(data, s->ij, s->kl, s->restlen, kb, cb, s->f, s->dfdx, s->dfdv);
+		BPH_mass_spring_force_spring_bending(data, s->ij, s->kl, s->restlen, kb, cb);
 #endif
 	}
 	else if (s->type & CLOTH_SPRING_TYPE_BENDING_ANG) {
@@ -474,9 +451,24 @@ static void cloth_calc_force(ClothModifierData *clmd, float UNUSED(frame), ListB
 		/* scale gravity force */
 		mul_v3_v3fl(gravity, clmd->scene->physics_settings.gravity, 0.001f * clmd->sim_parms->effector_weights->global_gravity);
 	}
+
 	vert = cloth->verts;
 	for (i = 0; i < cloth->mvert_num; i++, vert++) {
 		BPH_mass_spring_force_gravity(data, i, vert->mass, gravity);
+
+		/* Vertex goal springs */
+		if ((!(vert->flags & CLOTH_VERT_FLAG_PINNED)) && (vert->goal > FLT_EPSILON)) {
+			float goal_x[3], goal_v[3];
+			float k;
+
+			/* divide by time_scale to prevent goal vertices' delta locations from being multiplied */
+			interp_v3_v3v3(goal_x, vert->xold, vert->xconst, time / clmd->sim_parms->time_scale);
+			sub_v3_v3v3(goal_v, vert->xconst, vert->xold); /* distance covered over dt==1 */
+
+			k = vert->goal * clmd->sim_parms->goalspring / (clmd->sim_parms->avg_spring_len + FLT_EPSILON);
+
+			BPH_mass_spring_force_spring_goal(data, i, goal_x, goal_v, k, clmd->sim_parms->goalfrict * 0.01f);
+		}
 	}
 #endif
 
@@ -544,8 +536,9 @@ static void cloth_calc_force(ClothModifierData *clmd, float UNUSED(frame), ListB
 	for (LinkNode *link = cloth->springs; link; link = link->next) {
 		ClothSpring *spring = (ClothSpring *)link->link;
 		// only handle active springs
-		if (!(spring->flags & CLOTH_SPRING_FLAG_DEACTIVATE))
-			cloth_calc_spring_force(clmd, spring, time);
+		if (!(spring->flags & CLOTH_SPRING_FLAG_DEACTIVATE)) {
+			cloth_calc_spring_force(clmd, spring);
+		}
 	}
 }
 
diff --git a/source/blender/physics/intern/implicit.h b/source/blender/physics/intern/implicit.h
index d1a75ca5297..2f62ab98e12 100644
--- a/source/blender/physics/intern/implicit.h
+++ b/source/blender/physics/intern/implicit.h
@@ -114,19 +114,15 @@ void BPH_mass_spring_force_edge_wind(struct Implicit_Data *data, int v1, int v2,
 void BPH_mass_spring_force_vertex_wind(struct Implicit_Data *data, int v, float radius, const float (*winvec)[3]);
 /* Linear spring force between two points */
 bool BPH_mass_spring_force_spring_linear(struct Implicit_Data *data, int i, int j, float restlen,
-                                         float stiffness, float damping, bool no_compress, float clamp_force,
-                                         float r_f[3], float r_dfdx[3][3], float r_dfdv[3][3]);
+                                         float stiffness, float damping, bool no_compress, float clamp_force);
 /* Bending force, forming a triangle at the base of two structural springs */
-bool BPH_mass_spring_force_spring_bending(struct Implicit_Data *data, int i, int j, float restlen,
-                                          float kb, float cb,
-                                          float r_f[3], float r_dfdx[3][3], float r_dfdv[3][3]);
+bool BPH_mass_spring_force_spring_bending(struct Implicit_Data *data, int i, int j, float restlen, float kb, float cb);
 /* Angular bending force based on local target vectors */
 bool BPH_mass_spring_force_spring_bending_angular(struct Implicit_Data *data, int i, int j, int k,
                                                   const float target[3], float stiffness, float damping);
 /* Global goal spring */
 bool BPH_mass_spring_force_spring_goal(struct Implicit_Data *data, int i, const float goal_x[3], const float goal_v[3],
-                                       float stiffness, float damping,
-                                       float r_f[3], float r_dfdx[3][3], float r_dfdv[3][3]);
+                                       float stiffness, float damping);
 
 /* ======== Hair Volumetric Forces ======== */
 
diff --git a/source/blender/physics/intern/implicit_blender.c b/source/blender/physics/intern/implicit_blender.c
index 2ad8ee0547f..16cd335dc0c 100644
--- a/source/blender/physics/intern/implicit_blender.c
+++ b/source/blender/physics/intern/implicit_blender.c
@@ -1579,8 +1579,7 @@ BLI_INLINE void apply_spring(Implicit_Data *data, int i, int j, const float f[3]
 }
 
 bool BPH_mass_spring_force_spring_linear(Implicit_Data *data, int i, int j, float restlen,
-                                         float stiffness, float damping, bool no_compress, float clamp_force,
-                                         float r_f[3], float r_dfdx[3][3], float r_dfdv[3][3])
+                                         float stiffness, float damping, bool no_compress, float clamp_force)
 {
 	float extent[3], length, dir[3], vel[3];
 	
@@ -1608,25 +1607,15 @@ bool BPH_mass_spring_force_spring_linear(Implicit_Data *data, int i, int j, floa
 		
 		apply_spring(data, i, j, f, dfdx, dfdv);
 		
-		if (r_f) copy_v3_v3(r_f, f);
-		if (r_dfdx) copy_m3_m3(r_dfdx, dfdx);
-		if (r_dfdv) copy_m3_m3(r_dfdv, dfdv);
-		
 		return true;
 	}
 	else {
-		if (r_f) zero_v3(r_f);
-		if (r_dfdx) zero_m3(r_dfdx);
-		if (r_dfdv) zero_m3(r_dfdv);
-		
 		return false;
 	}
 }
 
 /* See "Stable but Responsive Cloth" (Choi, Ko 2005) */
-bool BPH_mass_spring_force_spring_bending(Implicit_Data *data, int i, int j, float restlen,
-                                          float kb, float cb,
-                                          float r_f[3], float r_dfdx[3][3], float r_dfdv[3][3])
+bool BPH_mass_spring_force_spring_bending(Implicit_Data *data, int i, int j, float restlen, float kb, float cb)
 {
 	float extent[3], length, dir[3], vel[3];
 	
@@ -1646,17 +1635,9 @@ bool BPH_mass_spring_force_spring_bending(Implicit_Data *data, int i, int j, flo
 		
 		apply_spring(data, i, j, f, dfdx, dfdv);
 		
-		if (r_f) copy_v3_v3(r_f, f);
-		if (r_dfdx) copy_m3_m3(r_dfdx, dfdx);
-		if (r_dfdv) copy_m3_m3(r_dfdv, dfdv);
-		
 		return true;
 	}
 	else {
-		if (r_f) zero_v3(r_f);
-		if (r_dfdx) zero_m3(r_dfdx);
-		if (r_dfdv) zero_m3(r_dfdv);
-		
 		return false;
 	}
 }
@@ -1945,8 +1926,7 @@ bool BPH_mass_spring_force_spring_bending_angular(Implicit_Data *data, int i, in
 }
 
 bool BPH_mass_spring_force_spring_goal(Implicit_Data *data, int i, const float goal_x[3], const float goal_v[3],
-                                       float stiffness, float damping,
-                                       float r_f[3], float r_dfdx[3][3], float r_dfdv[3][3])
+                                       float stiffness, float damping)
 {
 	float root_goal_x[3], root_goal_v[3], extent[3], length, dir[3], vel[3];
 	float f[3], dfdx[3][3], dfdv[3][3];
@@ -1973,17 +1953,9 @@ bool BPH_mass_spring_force_spring_goal(Implicit_Data *data, int i, const float g
 		add_m3_m3m3(data->dFdX[i].m, data->dFdX[i].m, dfdx);
 		add_m3_m3m3(data->dFdV[i].m, data->dFdV[i].m, dfdv);
 		
-		if (r_f) copy_v3_v3(r_f, f);
-		if (r_dfdx) copy_m3_m3(r_dfdx, dfdx);
-		if (r_dfdv) copy_m3_m3(r_dfdv, dfdv);
-		
 		return true;
 	}
 	else {
-		if (r_f) zero_v3(r_f);
-		if (r_dfdx) zero_m3(r_dfdx);
-		if (r_dfdv) zero_m3(r_dfdv);
-		
 		return false;
 	}
 }
diff --git a/source/blender/python/BPY_extern.h b/source/blender/python/BPY_extern.h
index 3148dab3c50..b4c36a7c516 100644
--- a/source/blender/python/BPY_extern.h
+++ b/source/blender/python/BPY_extern.h
@@ -75,7 +75,8 @@ void BPY_thread_restore(BPy_ThreadStatePtr tstate);
 
 bool	BPY_execute_filepath(struct bContext *C, const char *filepath, struct ReportList *reports);
 bool	BPY_execute_text(struct bContext *C, struct Text *text, struct ReportList *reports, const bool do_jump);
-bool	BPY_execute_string_as_number(struct bContext *C, const char *expr, double *value, const bool verbose);
+bool	BPY_execute_string_as_number(struct bContext *C, const char *expr, const bool verbose, double *r_value);
+bool	BPY_execute_string_as_string(struct bContext *C, const char *expr, const bool verbose, char **r_value);
 bool	BPY_execute_string_ex(struct bContext *C, const char *expr, bool use_eval);
 bool	BPY_execute_string(struct bContext *C, const char *expr);
 
diff --git a/source/blender/python/generic/py_capi_utils.c b/source/blender/python/generic/py_capi_utils.c
index 7b2d58a1268..2e789d6d4b3 100644
--- a/source/blender/python/generic/py_capi_utils.c
+++ b/source/blender/python/generic/py_capi_utils.c
@@ -918,11 +918,11 @@ char *PyC_FlagSet_AsString(PyC_FlagSet *item)
 	return cstring;
 }
 
-int PyC_FlagSet_ValueFromID_int(PyC_FlagSet *item, const char *identifier, int *value)
+int PyC_FlagSet_ValueFromID_int(PyC_FlagSet *item, const char *identifier, int *r_value)
 {
 	for ( ; item->identifier; item++) {
 		if (STREQ(item->identifier, identifier)) {
-			*value = item->value;
+			*r_value = item->value;
 			return 1;
 		}
 	}
@@ -930,9 +930,9 @@ int PyC_FlagSet_ValueFromID_int(PyC_FlagSet *item, const char *identifier, int *
 	return 0;
 }
 
-int PyC_FlagSet_ValueFromID(PyC_FlagSet *item, const char *identifier, int *value, const char *error_prefix)
+int PyC_FlagSet_ValueFromID(PyC_FlagSet *item, const char *identifier, int *r_value, const char *error_prefix)
 {
-	if (PyC_FlagSet_ValueFromID_int(item, identifier, value) == 0) {
+	if (PyC_FlagSet_ValueFromID_int(item, identifier, r_value) == 0) {
 		const char *enum_str = PyC_FlagSet_AsString(item);
 		PyErr_Format(PyExc_ValueError,
 		             "%s: '%.200s' not found in (%s)",
@@ -1006,7 +1006,7 @@ PyObject *PyC_FlagSet_FromBitfield(PyC_FlagSet *items, int flag)
  *
  * \note it is caller's responsibility to acquire & release GIL!
  */
-bool PyC_RunString_AsNumber(const char *expr, double *value, const char *filename)
+bool PyC_RunString_AsNumber(const char *expr, const char *filename, double *r_value)
 {
 	PyObject *py_dict, *mod, *retval;
 	bool ok = true;
@@ -1058,10 +1058,10 @@ bool PyC_RunString_AsNumber(const char *expr, double *value, const char *filenam
 			ok = false;
 		}
 		else if (!isfinite(val)) {
-			*value = 0.0;
+			*r_value = 0.0;
 		}
 		else {
-			*value = val;
+			*r_value = val;
 		}
 	}
 
@@ -1070,4 +1070,41 @@ bool PyC_RunString_AsNumber(const char *expr, double *value, const char *filenam
 	return ok;
 }
 
+/**
+ * Evaluate \a expr as a Python expression and return its value as a C string.
+ *
+ * \param expr: Expression to evaluate (run with `Py_eval_input`).
+ * \param filename: Name used to create the namespace the expression runs in.
+ * \param r_value: On success receives a nul-terminated copy of the result allocated
+ * with #MEM_mallocN, owned by the caller. Left untouched on failure.
+ * \return true on success, false when the evaluation fails, the result can not be
+ * read as a string or the copy can not be allocated (a Python error is left set).
+ *
+ * \note this function does not acquire the GIL itself, that is up to the caller.
+ */
+bool PyC_RunString_AsString(const char *expr, const char *filename, char **r_value)
+{
+	PyObject *py_dict, *retval;
+	bool ok = true;
+	PyObject *main_mod = NULL;
+
+	PyC_MainModule_Backup(&main_mod);
+
+	py_dict = PyC_DefaultNameSpace(filename);
+
+	retval = PyRun_String(expr, Py_eval_input, py_dict, py_dict);
+
+	if (retval == NULL) {
+		ok = false;
+	}
+	else {
+		const char *val;
+		Py_ssize_t val_len;
+
+		val = _PyUnicode_AsStringAndSize(retval, &val_len);
+		if (val == NULL) {
+			/* Never copy from a NULL buffer, whether or not an error is set. */
+			ok = false;
+		}
+		else {
+			char *val_alloc = MEM_mallocN(val_len + 1, __func__);
+			if (val_alloc == NULL) {
+				/* Set an exception so callers reporting the Python error have something to show. */
+				PyErr_NoMemory();
+				ok = false;
+			}
+			else {
+				/* Copy including the terminating nul. */
+				memcpy(val_alloc, val, val_len + 1);
+				*r_value = val_alloc;
+			}
+		}
+
+		Py_DECREF(retval);
+	}
+
+	PyC_MainModule_Restore(main_mod);
+
+	return ok;
+}
+
 #endif  /* #ifndef MATH_STANDALONE */
diff --git a/source/blender/python/generic/py_capi_utils.h b/source/blender/python/generic/py_capi_utils.h
index 04cfc8801eb..3f89e1d82a0 100644
--- a/source/blender/python/generic/py_capi_utils.h
+++ b/source/blender/python/generic/py_capi_utils.h
@@ -75,12 +75,13 @@ typedef struct PyC_FlagSet {
 } PyC_FlagSet;
 
 char     *PyC_FlagSet_AsString(PyC_FlagSet *item);
-int       PyC_FlagSet_ValueFromID_int(PyC_FlagSet *item, const char *identifier, int *value);
-int       PyC_FlagSet_ValueFromID(PyC_FlagSet *item, const char *identifier, int *value, const char *error_prefix);
+int       PyC_FlagSet_ValueFromID_int(PyC_FlagSet *item, const char *identifier, int *r_value);
+int       PyC_FlagSet_ValueFromID(PyC_FlagSet *item, const char *identifier, int *r_value, const char *error_prefix);
 int       PyC_FlagSet_ToBitfield(PyC_FlagSet *items, PyObject *value, int *r_value, const char *error_prefix);
 PyObject *PyC_FlagSet_FromBitfield(PyC_FlagSet *items, int flag);
 
-bool PyC_RunString_AsNumber(const char *expr, double *value, const char *filename);
+bool PyC_RunString_AsNumber(const char *expr, const char *filename, double *r_value);
+bool PyC_RunString_AsString(const char *expr, const char *filename, char **r_value);
 
 int PyC_ParseBool(PyObject *o, void *p);
 
diff --git a/source/blender/python/intern/bpy_interface.c b/source/blender/python/intern/bpy_interface.c
index 311f621e13b..55e477b0214 100644
--- a/source/blender/python/intern/bpy_interface.c
+++ b/source/blender/python/intern/bpy_interface.c
@@ -572,21 +572,58 @@ void BPY_DECREF_RNA_INVALIDATE(void *pyob_ptr)
 /**
  * \return success
  */
-bool BPY_execute_string_as_number(bContext *C, const char *expr, double *value, const bool verbose)
+bool BPY_execute_string_as_number(bContext *C, const char *expr, const bool verbose, double *r_value)
 {
 	PyGILState_STATE gilstate;
 	bool ok = true;
 
-	if (!value || !expr) return -1;
+	if (!r_value || !expr) {
+		return false;  /* invalid arguments are a failure (-1 would convert to 'true' for a bool) */
+	}
+
+	if (expr[0] == '\0') {
+		*r_value = 0.0;
+		return ok;
+	}
+
+	bpy_context_set(C, &gilstate);
+
+	ok = PyC_RunString_AsNumber(expr, "<blender button>", r_value);
+
+	if (ok == false) {
+		if (verbose) {
+			BPy_errors_to_report_ex(CTX_wm_reports(C), false, false);
+		}
+		else {
+			PyErr_Clear();
+		}
+	}
+
+	bpy_context_clear(C, &gilstate);
+
+	return ok;
+}
+
+/**
+ * \return success
+ */
+bool BPY_execute_string_as_string(bContext *C, const char *expr, const bool verbose, char **r_value)
+{
+	PyGILState_STATE gilstate;
+	bool ok = true;
+
+	if (!r_value || !expr) {
+		return false;  /* invalid arguments are a failure (-1 would convert to 'true' for a bool) */
+	}
 
 	if (expr[0] == '\0') {
-		*value = 0.0;
+		*r_value = NULL;
 		return ok;
 	}
 
 	bpy_context_set(C, &gilstate);
 
-	ok = PyC_RunString_AsNumber(expr, value, "<blender button>");
+	ok = PyC_RunString_AsString(expr, "<blender button>", r_value);
 
 	if (ok == false) {
 		if (verbose) {
@@ -602,6 +639,7 @@ bool BPY_execute_string_as_number(bContext *C, const char *expr, double *value,
 	return ok;
 }
 
+
 bool BPY_execute_string_ex(bContext *C, const char *expr, bool use_eval)
 {
 	PyGILState_STATE gilstate;
diff --git a/source/blender/python/intern/bpy_utils_units.c b/source/blender/python/intern/bpy_utils_units.c
index 974d7c5549c..e0c70483e3c 100644
--- a/source/blender/python/intern/bpy_utils_units.c
+++ b/source/blender/python/intern/bpy_utils_units.c
@@ -201,7 +201,7 @@ static PyObject *bpyunits_to_value(PyObject *UNUSED(self), PyObject *args, PyObj
 
 	bUnit_ReplaceString(str, (int)str_len, uref, scale, usys, ucat);
 
-	if (!PyC_RunString_AsNumber(str, &result, "<bpy_units_api>")) {
+	if (!PyC_RunString_AsNumber(str, "<bpy_units_api>", &result)) {
 		if (PyErr_Occurred()) {
 			PyErr_Print();
 			PyErr_Clear();
diff --git a/source/blender/python/intern/gpu_offscreen.c b/source/blender/python/intern/gpu_offscreen.c
index c4863b2a92f..7711ce18bd0 100644
--- a/source/blender/python/intern/gpu_offscreen.c
+++ b/source/blender/python/intern/gpu_offscreen.c
@@ -202,7 +202,7 @@ static PyObject *pygpu_offscreen_draw_view3d(BPy_GPUOffScreen *self, PyObject *a
 	ARegion *ar;
 	GPUFX *fx;
 	GPUFXSettings fx_settings;
-	void *rv3d_mats;
+	struct RV3DMatrixStore *rv3d_mats;
 
 	BPY_GPU_OFFSCREEN_CHECK_OBJ(self);
 
diff --git a/source/blender/python/mathutils/mathutils_Matrix.c b/source/blender/python/mathutils/mathutils_Matrix.c
index 4e980e4c0e6..bd44e77e7c6 100644
--- a/source/blender/python/mathutils/mathutils_Matrix.c
+++ b/source/blender/python/mathutils/mathutils_Matrix.c
@@ -1301,7 +1301,7 @@ PyDoc_STRVAR(Matrix_to_scale_doc,
 "   :return: Return the scale of a matrix.\n"
 "   :rtype: :class:`Vector`\n"
 "\n"
-"   .. note:: This method does not return negative a scale on any axis because it is not possible to obtain this data from the matrix alone.\n"
+"   .. note:: This method does not return a negative scale on any axis because it is not possible to obtain this data from the matrix alone.\n"
 );
 static PyObject *Matrix_to_scale(MatrixObject *self)
 {
@@ -1390,11 +1390,11 @@ PyDoc_STRVAR(Matrix_invert_doc,
 "\n"
 "   Set the matrix to its inverse.\n"
 "\n"
-"   :arg fallback: Set the matrix to this value when the inverse can't be calculated\n"
+"   :arg fallback: Set the matrix to this value when the inverse cannot be calculated\n"
 "      (instead of raising a :exc:`ValueError` exception).\n"
 "   :type fallback: :class:`Matrix`\n"
 "\n"
-"   .. seealso:: <https://en.wikipedia.org/wiki/Inverse_matrix>\n"
+"   .. seealso:: `Inverse matrix <https://en.wikipedia.org/wiki/Inverse_matrix>`__ on Wikipedia.\n"
 );
 static PyObject *Matrix_invert(MatrixObject *self, PyObject *args)
 {
@@ -1505,7 +1505,7 @@ PyDoc_STRVAR(Matrix_invert_safe_doc,
 "   If degenerated (e.g. zero scale on an axis), add some epsilon to its diagonal, to get an invertible one.\n"
 "   If tweaked matrix is still degenerated, set to the identity matrix instead.\n"
 "\n"
-"   .. seealso:: <https://en.wikipedia.org/wiki/Inverse_matrix>\n"
+"   .. seealso:: `Inverse matrix <https://en.wikipedia.org/wiki/Inverse_matrix>`__ on Wikipedia.\n"
 );
 static PyObject *Matrix_invert_safe(MatrixObject *self)
 {
@@ -1554,9 +1554,9 @@ PyDoc_STRVAR(Matrix_adjugate_doc,
 "\n"
 "   Set the matrix to its adjugate.\n"
 "\n"
-"   .. note:: When the matrix cant be adjugated a :exc:`ValueError` exception is raised.\n"
+"   .. note:: When the matrix cannot be adjugated a :exc:`ValueError` exception is raised.\n"
 "\n"
-"   .. seealso:: <https://en.wikipedia.org/wiki/Adjugate_matrix>\n"
+"   .. seealso:: `Adjugate matrix <https://en.wikipedia.org/wiki/Adjugate_matrix>`__ on Wikipedia.\n"
 );
 static PyObject *Matrix_adjugate(MatrixObject *self)
 {
@@ -1733,7 +1733,7 @@ PyDoc_STRVAR(Matrix_determinant_doc,
 "   :return: Return the determinant of a matrix.\n"
 "   :rtype: float\n"
 "\n"
-"   .. seealso:: <https://en.wikipedia.org/wiki/Determinant>\n"
+"   .. seealso:: `Determinant <https://en.wikipedia.org/wiki/Determinant>`__ on Wikipedia.\n"
 );
 static PyObject *Matrix_determinant(MatrixObject *self)
 {
@@ -1755,7 +1755,7 @@ PyDoc_STRVAR(Matrix_transpose_doc,
 "\n"
 "   Set the matrix to its transpose.\n"
 "\n"
-"   .. seealso:: <https://en.wikipedia.org/wiki/Transpose>\n"
+"   .. seealso:: `Transpose <https://en.wikipedia.org/wiki/Transpose>`__ on Wikipedia.\n"
 );
 static PyObject *Matrix_transpose(MatrixObject *self)
 {
@@ -1887,10 +1887,10 @@ PyDoc_STRVAR(Matrix_identity_doc,
 "\n"
 "   Set the matrix to the identity matrix.\n"
 "\n"
-"   .. note:: An object with zero location and rotation, a scale of one,\n"
+"   .. note:: An object with a location and rotation of zero, and a scale of one\n"
 "      will have an identity matrix.\n"
 "\n"
-"   .. seealso:: <https://en.wikipedia.org/wiki/Identity_matrix>\n"
+"   .. seealso:: `Identity matrix <https://en.wikipedia.org/wiki/Identity_matrix>`__ on Wikipedia.\n"
 );
 static PyObject *Matrix_identity(MatrixObject *self)
 {
diff --git a/source/blender/render/CMakeLists.txt b/source/blender/render/CMakeLists.txt
index 9e40ab02ee4..569b207c966 100644
--- a/source/blender/render/CMakeLists.txt
+++ b/source/blender/render/CMakeLists.txt
@@ -35,6 +35,7 @@ set(INC
 	../makesdna
 	../makesrna
 	../physics
+	../../../intern/atomic
 	../../../intern/guardedalloc
 	../../../intern/mikktspace
 	../../../intern/smoke/extern
diff --git a/source/blender/render/intern/source/pointdensity.c b/source/blender/render/intern/source/pointdensity.c
index a03ea9cb896..fb047aad897 100644
--- a/source/blender/render/intern/source/pointdensity.c
+++ b/source/blender/render/intern/source/pointdensity.c
@@ -983,11 +983,12 @@ void RE_point_density_minmax(
 	}
 	else {
 		float radius[3] = {pd->radius, pd->radius, pd->radius};
-		float *loc, *size;
+		BoundBox *bb = BKE_object_boundbox_get(object);
 
-		if (BKE_object_obdata_texspace_get(pd->object, NULL, &loc, &size, NULL)) {
-			sub_v3_v3v3(r_min, loc, size);
-			add_v3_v3v3(r_max, loc, size);
+		if (bb != NULL) {
+			BLI_assert((bb->flag & BOUNDBOX_DIRTY) == 0);
+			copy_v3_v3(r_min, bb->vec[0]);
+			copy_v3_v3(r_max, bb->vec[6]);
 			/* Adjust texture space to include density points on the boundaries. */
 			sub_v3_v3(r_min, radius);
 			add_v3_v3(r_max, radius);
diff --git a/source/blender/render/intern/source/render_texture.c b/source/blender/render/intern/source/render_texture.c
index b4a14f5337d..42e4d4f27f6 100644
--- a/source/blender/render/intern/source/render_texture.c
+++ b/source/blender/render/intern/source/render_texture.c
@@ -1112,14 +1112,15 @@ static int multitex(Tex *tex,
                     const short which_output,
                     struct ImagePool *pool,
                     const bool skip_load_image,
-                    const bool texnode_preview)
+                    const bool texnode_preview,
+                    const bool use_nodes)
 {
 	float tmpvec[3];
 	int retval = 0; /* return value, int:0, col:1, nor:2, everything:3 */
 
 	texres->talpha = false;  /* is set when image texture returns alpha (considered premul) */
 	
-	if (tex->use_nodes && tex->nodetree) {
+	if (use_nodes && tex->use_nodes && tex->nodetree) {
 		retval = ntreeTexExecTree(tex->nodetree, texres, texvec, dxt, dyt, osatex, thread,
 		                          tex, which_output, R.r.cfra, texnode_preview, NULL, NULL);
 	}
@@ -1239,7 +1240,8 @@ static int multitex_nodes_intern(Tex *tex,
                                  ImagePool *pool,
                                  const bool scene_color_manage,
                                  const bool skip_load_image,
-                                 const bool texnode_preview)
+                                 const bool texnode_preview,
+                                 const bool use_nodes)
 {
 	if (tex==NULL) {
 		memset(texres, 0, sizeof(TexResult));
@@ -1264,7 +1266,8 @@ static int multitex_nodes_intern(Tex *tex,
 			                  which_output,
 			                  pool,
 			                  skip_load_image,
-			                  texnode_preview);
+			                  texnode_preview,
+			                  use_nodes);
 
 			if (mtex->mapto & (MAP_COL+MAP_COLSPEC+MAP_COLMIR)) {
 				ImBuf *ibuf = BKE_image_pool_acquire_ibuf(tex->ima, &tex->iuser, pool);
@@ -1311,7 +1314,8 @@ static int multitex_nodes_intern(Tex *tex,
 			                  which_output,
 			                  pool,
 			                  skip_load_image,
-			                  texnode_preview);
+			                  texnode_preview,
+			                  use_nodes);
 
 			{
 				ImBuf *ibuf = BKE_image_pool_acquire_ibuf(tex->ima, &tex->iuser, pool);
@@ -1341,7 +1345,8 @@ static int multitex_nodes_intern(Tex *tex,
 		                which_output,
 		                pool,
 		                skip_load_image,
-		                texnode_preview);
+		                texnode_preview,
+		                use_nodes);
 	}
 }
 
@@ -1354,7 +1359,8 @@ int multitex_nodes(Tex *tex, float texvec[3], float dxt[3], float dyt[3], int os
 	return multitex_nodes_intern(tex, texvec, dxt, dyt, osatex, texres,
 	                             thread, which_output, shi, mtex, pool, R.scene_color_manage,
 	                             (R.r.scemode & R_NO_IMAGE_LOAD) != 0,
-	                             (R.r.scemode & R_TEXNODE_PREVIEW) != 0);
+	                             (R.r.scemode & R_TEXNODE_PREVIEW) != 0,
+	                             true);
 }
 
 /* this is called for surface shading */
@@ -1378,7 +1384,8 @@ static int multitex_mtex(ShadeInput *shi, MTex *mtex, float texvec[3], float dxt
 		                mtex->which_output,
 		                pool,
 		                skip_load_image,
-		                (R.r.scemode & R_TEXNODE_PREVIEW) != 0);
+		                (R.r.scemode & R_TEXNODE_PREVIEW) != 0,
+		                true);
 	}
 }
 
@@ -1408,7 +1415,8 @@ int multitex_ext(Tex *tex,
 	                             pool,
 	                             scene_color_manage,
 	                             skip_load_image,
-	                             false);
+	                             false,
+	                             true);
 }
 
 /* extern-tex doesn't support nodes (ntreeBeginExec() can't be called when rendering is going on)\
@@ -1417,13 +1425,19 @@ int multitex_ext(Tex *tex,
  */
 int multitex_ext_safe(Tex *tex, float texvec[3], TexResult *texres, struct ImagePool *pool, bool scene_color_manage, const bool skip_load_image)
 {
-	int use_nodes= tex->use_nodes, retval;
-	
-	tex->use_nodes = false;
-	retval= multitex_nodes_intern(tex, texvec, NULL, NULL, 0, texres, 0, 0, NULL, NULL, pool, scene_color_manage, skip_load_image, false);
-	tex->use_nodes= use_nodes;
-	
-	return retval;
+	return multitex_nodes_intern(tex,
+	                             texvec,
+	                             NULL, NULL,
+	                             0,
+	                             texres,
+	                             0,
+	                             0,
+	                             NULL, NULL,
+	                             pool,
+	                             scene_color_manage,
+	                             skip_load_image,
+	                             false,
+	                             false);
 }
 
 
@@ -2873,7 +2887,8 @@ void do_volume_tex(ShadeInput *shi, const float *xyz, int mapto_flag, float col_
 			                  mtex->which_output,
 			                  re->pool,
 			                  skip_load_image,
-			                  texnode_preview);	/* NULL = dxt/dyt, 0 = shi->osatex - not supported */
+			                  texnode_preview,
+			                  true);	/* NULL = dxt/dyt, 0 = shi->osatex - not supported */
 			
 			/* texture output */
 
@@ -3051,7 +3066,8 @@ void do_halo_tex(HaloRen *har, float xn, float yn, float col_r[4])
 	               mtex->which_output,
 	               har->pool,
 	               skip_load_image,
-	               texnode_preview);
+	               texnode_preview,
+	               true);
 
 	/* texture output */
 	if (rgb && (mtex->texflag & MTEX_RGBTOINT)) {
@@ -3274,7 +3290,8 @@ void do_sky_tex(
 			               mtex->which_output,
 			               R.pool,
 			               skip_load_image,
-			               texnode_preview);
+			               texnode_preview,
+			               true);
 			
 			/* texture output */
 			if (rgb && (mtex->texflag & MTEX_RGBTOINT)) {
@@ -3500,7 +3517,8 @@ void do_lamp_tex(LampRen *la, const float lavec[3], ShadeInput *shi, float col_r
 			               mtex->which_output,
 			               R.pool,
 			               skip_load_image,
-			               texnode_preview);
+			               texnode_preview,
+			               true);
 
 			/* texture output */
 			if (rgb && (mtex->texflag & MTEX_RGBTOINT)) {
@@ -3614,7 +3632,8 @@ int externtex(MTex *mtex,
 	               mtex->which_output,
 	               pool,
 	               skip_load_image,
-	               texnode_preview);
+	               texnode_preview,
+	               true);
 	
 	if (rgb) {
 		texr.tin = IMB_colormanagement_get_luminance(&texr.tr);
diff --git a/source/blender/render/intern/source/shadeoutput.c b/source/blender/render/intern/source/shadeoutput.c
index c4de95e863e..a8fb72fb7f8 100644
--- a/source/blender/render/intern/source/shadeoutput.c
+++ b/source/blender/render/intern/source/shadeoutput.c
@@ -2143,7 +2143,7 @@ const float (*RE_object_instance_get_matrix(struct ObjectInstanceRen *obi, int m
 
 const float (*RE_render_current_get_matrix(int matrix_id))[4]
 {
-	switch(matrix_id) {
+	switch (matrix_id) {
 		case RE_VIEW_MATRIX:
 			return (const float(*)[4])R.viewmat;
 		case RE_VIEWINV_MATRIX:
@@ -2171,4 +2171,4 @@ float RE_fresnel_dielectric(float incoming[3], float normal[3], float eta)
 	}
 
 	return result;
-}
-\ No newline at end of file
+}
diff --git a/source/blender/render/intern/source/volume_precache.c b/source/blender/render/intern/source/volume_precache.c
index 5377d0eba00..752a9df0b79 100644
--- a/source/blender/render/intern/source/volume_precache.c
+++ b/source/blender/render/intern/source/volume_precache.c
@@ -60,6 +60,8 @@
 #include "volumetric.h"
 #include "volume_precache.h"
 
+#include "atomic_ops.h"
+
 
 /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
 /* defined in pipeline.c, is hardcopy of active dynamic allocated Render */
@@ -509,7 +511,8 @@ static void *vol_precache_part_test(void *data)
  */
 typedef struct VolPrecacheState {
 	double lasttime;
-	int totparts;
+	unsigned int doneparts;
+	unsigned int totparts;
 } VolPrecacheState;
 
 static void vol_precache_part(TaskPool * __restrict pool, void *taskdata, int UNUSED(threadid))
@@ -574,13 +577,15 @@ static void vol_precache_part(TaskPool * __restrict pool, void *taskdata, int UN
 		}
 	}
 
+	unsigned int doneparts = atomic_add_and_fetch_u(&state->doneparts, 1);
+
 	time = PIL_check_seconds_timer();
 	if (time - state->lasttime > 1.0) {
 		ThreadMutex *mutex = BLI_task_pool_user_mutex(pool);
 
 		if (BLI_mutex_trylock(mutex)) {
 			char str[64];
-			float ratio = (float)BLI_task_pool_tasks_done(pool)/(float)state->totparts;
+			float ratio = (float)doneparts/(float)state->totparts;
 			BLI_snprintf(str, sizeof(str), IFACE_("Precaching volume: %d%%"), (int)(100.0f * ratio));
 			re->i.infostr = str;
 			re->stats_draw(re->sdh, &re->i);
@@ -631,6 +636,7 @@ static void precache_launch_parts(Render *re, RayObject *tree, ShadeInput *shi,
 	
 	/* setup task scheduler */
 	memset(&state, 0, sizeof(state));
+	state.doneparts = 0;
 	state.totparts = parts[0]*parts[1]*parts[2];
 	state.lasttime = PIL_check_seconds_timer();
 	
diff --git a/source/blender/windowmanager/CMakeLists.txt b/source/blender/windowmanager/CMakeLists.txt
index b6245a8c0d1..925563a0777 100644
--- a/source/blender/windowmanager/CMakeLists.txt
+++ b/source/blender/windowmanager/CMakeLists.txt
@@ -128,10 +128,6 @@ if(WITH_PYTHON)
 		../python
 	)
 	add_definitions(-DWITH_PYTHON)
-
-	if(WITH_PYTHON_SECURITY)
-		add_definitions(-DWITH_PYTHON_SECURITY)
-	endif()
 endif()
 
 if(WITH_GAMEENGINE)
diff --git a/source/blender/windowmanager/WM_api.h b/source/blender/windowmanager/WM_api.h
index 2b82f1becb3..1cfd20defe0 100644
--- a/source/blender/windowmanager/WM_api.h
+++ b/source/blender/windowmanager/WM_api.h
@@ -252,6 +252,7 @@ int         WM_operator_confirm_message(struct bContext *C, struct wmOperator *o
 
 		/* operator api */
 void		WM_operator_free		(struct wmOperator *op);
+void		WM_operator_free_all_after(wmWindowManager *wm, struct wmOperator *op);
 void		WM_operator_type_set(struct wmOperator *op, struct wmOperatorType *ot);
 void		WM_operator_stack_clear(struct wmWindowManager *wm);
 void		WM_operator_handlers_clear(wmWindowManager *wm, struct wmOperatorType *ot);
@@ -276,6 +277,7 @@ int			WM_operator_call		(struct bContext *C, struct wmOperator *op);
 int			WM_operator_call_notest(struct bContext *C, struct wmOperator *op);
 int			WM_operator_repeat		(struct bContext *C, struct wmOperator *op);
 bool        WM_operator_repeat_check(const struct bContext *C, struct wmOperator *op);
+bool        WM_operator_is_repeat(const struct bContext *C, const struct wmOperator *op);
 int         WM_operator_name_call_ptr(struct bContext *C, struct wmOperatorType *ot, short context, struct PointerRNA *properties);
 int			WM_operator_name_call(struct bContext *C, const char *opstring, short context, struct PointerRNA *properties);
 int			WM_operator_call_py(struct bContext *C, struct wmOperatorType *ot, short context, struct PointerRNA *properties, struct ReportList *reports, const bool is_undo);
diff --git a/source/blender/windowmanager/WM_types.h b/source/blender/windowmanager/WM_types.h
index cd46e24264d..97f9257b4cb 100644
--- a/source/blender/windowmanager/WM_types.h
+++ b/source/blender/windowmanager/WM_types.h
@@ -542,7 +542,15 @@ typedef struct wmOperatorType {
 	 * canceled due to some external reason, cancel is called
 	 * - see defines below for return values */
 	int (*invoke)(struct bContext *, struct wmOperator *, const struct wmEvent *) ATTR_WARN_UNUSED_RESULT;
+
+	/* Called when a modal operator is canceled (not used often).
+	 * Internal cleanup can be done here if needed. */
 	void (*cancel)(struct bContext *, struct wmOperator *);
+
+	/* Modal is used for operators which continuously run, eg:
+	 * fly mode, knife tool, circle select are all examples of modal operators.
+	 * Modal operators can handle events which would normally access other operators,
+	 * they keep running until they don't return `OPERATOR_RUNNING_MODAL`. */
 	int (*modal)(struct bContext *, struct wmOperator *, const struct wmEvent *) ATTR_WARN_UNUSED_RESULT;
 
 	/* verify if the operator can be executed in the current context, note
diff --git a/source/blender/windowmanager/intern/wm.c b/source/blender/windowmanager/intern/wm.c
index b76a1f1d422..4351cd22b18 100644
--- a/source/blender/windowmanager/intern/wm.c
+++ b/source/blender/windowmanager/intern/wm.c
@@ -107,6 +107,17 @@ void WM_operator_free(wmOperator *op)
 	MEM_freeN(op);
 }
 
+void WM_operator_free_all_after(wmWindowManager *wm, struct wmOperator *op)
+{
+	op = op->next;  /* 'op' itself is kept, only the operators following it are freed */
+	while (op != NULL) {
+		wmOperator *op_next = op->next;  /* read the link before 'op' is unlinked and freed */
+		BLI_remlink(&wm->operators, op);
+		WM_operator_free(op);
+		op = op_next;
+	}
+}
+
 /**
  * Use with extreme care!,
  * properties, customdata etc - must be compatible.
@@ -149,18 +160,23 @@ static void wm_reports_free(wmWindowManager *wm)
 void wm_operator_register(bContext *C, wmOperator *op)
 {
 	wmWindowManager *wm = CTX_wm_manager(C);
-	int tot;
+	int tot = 0;
 
 	BLI_addtail(&wm->operators, op);
-	tot = BLI_listbase_count(&wm->operators);
-	
-	while (tot > MAX_OP_REGISTERED) {
-		wmOperator *opt = wm->operators.first;
-		BLI_remlink(&wm->operators, opt);
-		WM_operator_free(opt);
-		tot--;
+
+	/* only count registered operators */
+	while (op) {
+		wmOperator *op_prev = op->prev;
+		if (op->type->flag & OPTYPE_REGISTER) {
+			tot += 1;
+		}
+		if (tot > MAX_OP_REGISTERED) {
+			BLI_remlink(&wm->operators, op);
+			WM_operator_free(op);
+		}
+		op = op_prev;
 	}
-	
+
 	/* so the console is redrawn */
 	WM_event_add_notifier(C, NC_SPACE | ND_SPACE_INFO_REPORT, NULL);
 	WM_event_add_notifier(C, NC_WM | ND_HISTORY, NULL);
diff --git a/source/blender/windowmanager/intern/wm_event_system.c b/source/blender/windowmanager/intern/wm_event_system.c
index d2b0acd836b..f26ee0e73ec 100644
--- a/source/blender/windowmanager/intern/wm_event_system.c
+++ b/source/blender/windowmanager/intern/wm_event_system.c
@@ -715,7 +715,9 @@ static void wm_operator_reports(bContext *C, wmOperator *op, int retval, bool ca
  */
 static bool wm_operator_register_check(wmWindowManager *wm, wmOperatorType *ot)
 {
-	return wm && (wm->op_undo_depth == 0) && (ot->flag & OPTYPE_REGISTER);
+	/* Check undo flag here since undo operators are also added to the list,
+	 * to support checking if the same operator is run twice. */
+	return wm && (wm->op_undo_depth == 0) && (ot->flag & (OPTYPE_REGISTER | OPTYPE_UNDO));
 }
 
 static void wm_operator_finished(bContext *C, wmOperator *op, const bool repeat)
@@ -878,6 +880,20 @@ bool WM_operator_repeat_check(const bContext *UNUSED(C), wmOperator *op)
 	return false;
 }
 
+bool WM_operator_is_repeat(const bContext *C, const wmOperator *op)
+{
+	/* 'op' may be in the operators list or not, without neighbors compare against the list's last item */
+	wmOperator *op_prev;
+	if (op->prev == NULL && op->next == NULL) {
+		wmWindowManager *wm = CTX_wm_manager(C);
+		op_prev = wm->operators.last;
+	}
+	else {
+		op_prev = op->prev;
+	}
+	return (op_prev && (op_prev != op) && (op->type == op_prev->type));  /* sole list item is no repeat of itself */
+}
+
 static wmOperator *wm_operator_create(wmWindowManager *wm, wmOperatorType *ot,
                                       PointerRNA *properties, ReportList *reports)
 {
@@ -3179,6 +3195,8 @@ void wm_event_add_ghostevent(wmWindowManager *wm, wmWindow *win, int type, int U
 			GHOST_TEventCursorData *cd = customdata;
 
 			copy_v2_v2_int(&event.x, &cd->x);
+			wm_stereo3d_mouse_offset_apply(win, &event.x);
+
 			event.type = MOUSEMOVE;
 			wm_event_add_mousemove(win, &event);
 			copy_v2_v2_int(&evt->x, &event.x);
diff --git a/source/blender/windowmanager/intern/wm_files.c b/source/blender/windowmanager/intern/wm_files.c
index 05d63869074..028355f9af3 100644
--- a/source/blender/windowmanager/intern/wm_files.c
+++ b/source/blender/windowmanager/intern/wm_files.c
@@ -316,7 +316,7 @@ static void wm_window_match_do(bContext *C, ListBase *oldwmlist)
 }
 
 /* in case UserDef was read, we re-initialize all, and do versioning */
-static void wm_init_userdef(bContext *C, const bool from_memory)
+static void wm_init_userdef(bContext *C, const bool use_factory_settings)
 {
 	Main *bmain = CTX_data_main(C);
 
@@ -336,7 +336,7 @@ static void wm_init_userdef(bContext *C, const bool from_memory)
 	}
 
 	/* avoid re-saving for every small change to our prefs, allow overrides */
-	if (from_memory) {
+	if (use_factory_settings) {
 		BLO_update_defaults_userpref_blend();
 	}
 
@@ -470,6 +470,10 @@ static void wm_file_read_post(bContext *C, bool is_startup_file)
 	if (is_startup_file) {
 		/* possible python hasn't been initialized */
 		if (CTX_py_init_get(C)) {
+			/* Only run when we have a template path found. */
+			if (BKE_appdir_app_template_any()) {
+				BPY_execute_string(C, "__import__('bl_app_template_utils').reset()");
+			}
 			/* sync addons, these may have changed from the defaults */
 			BPY_execute_string(C, "__import__('addon_utils').reset_all()");
 
@@ -554,7 +558,7 @@ bool WM_file_read(bContext *C, const char *filepath, ReportList *reports)
 		
 		/* confusing this global... */
 		G.relbase_valid = 1;
-		retval = BKE_blendfile_read(C, filepath, reports);
+		retval = BKE_blendfile_read(C, filepath, reports, 0);
 		/* when loading startup.blend's, we can be left with a blank path */
 		if (G.main->name[0]) {
 			G.save_over = 1;
@@ -629,22 +633,34 @@ bool WM_file_read(bContext *C, const char *filepath, ReportList *reports)
 
 
 /**
- * called on startup,  (context entirely filled with NULLs)
- * or called for 'New File'
- * both startup.blend and userpref.blend are checked
- * the optional parameter custom_file points to an alternative startup page
- * custom_file can be NULL
+ * Called on startup, (context entirely filled with NULLs)
+ * or called for 'New File' both startup.blend and userpref.blend are checked.
+ *
+ * \param use_factory_settings: Ignore on-disk startup file, use bundled ``datatoc_startup_blend`` instead.
+ * Used for "Restore Factory Settings".
+ * \param filepath_startup_override: Optional path pointing to an alternative blend file (may be NULL).
+ * \param app_template_override: Template to use instead of the template defined in user-preferences.
+ * When not-null, this is written into the user preferences.
  */
-int wm_homefile_read(bContext *C, ReportList *reports, bool from_memory, const char *custom_file)
+int wm_homefile_read(
+        bContext *C, ReportList *reports,
+        bool use_factory_settings, bool use_empty_data,
+        const char *filepath_startup_override, const char *app_template_override)
 {
 	ListBase wmbase;
-	char startstr[FILE_MAX];
-	char prefstr[FILE_MAX];
-	int success = 0;
+	bool success = false;
+
+	char filepath_startup[FILE_MAX];
+	char filepath_userdef[FILE_MAX];
+
+	/* When 'app_template' is set: '{BLENDER_SYSTEM_SCRIPTS}/startup/bl_app_templates_system/{app_template}' */
+	char app_template_system[FILE_MAX];
+	/* When 'app_template' is set: '{BLENDER_USER_CONFIG}/{app_template}' */
+	char app_template_config[FILE_MAX];
 
 	/* Indicates whether user preferences were really load from memory.
 	 *
-	 * This is used for versioning code, and for this we can not rely on from_memory
+	 * This is used for versioning code, and for this we can not rely on use_factory_settings
 	 * passed via argument. This is because there might be configuration folder
 	 * exists but it might not have userpref.blend and in this case we fallback to
 	 * reading home file from memory.
@@ -652,9 +668,10 @@ int wm_homefile_read(bContext *C, ReportList *reports, bool from_memory, const c
 	 * And in this case versioning code is to be run.
 	 */
 	bool read_userdef_from_memory = true;
+	eBLOReadSkip skip_flags = 0;
 
 	/* options exclude eachother */
-	BLI_assert((from_memory && custom_file) == 0);
+	BLI_assert((use_factory_settings && filepath_startup_override) == 0);
 
 	if ((G.f & G_SCRIPT_OVERRIDE_PREF) == 0) {
 		BKE_BIT_TEST_SET(G.f, (U.flag & USER_SCRIPT_AUTOEXEC_DISABLE) == 0, G_SCRIPT_AUTOEXEC);
@@ -665,71 +682,146 @@ int wm_homefile_read(bContext *C, ReportList *reports, bool from_memory, const c
 	UI_view2d_zoom_cache_reset();
 
 	G.relbase_valid = 0;
-	if (!from_memory) {
-		const char * const cfgdir = BKE_appdir_folder_id(BLENDER_USER_CONFIG, NULL);
-		if (custom_file) {
-			BLI_strncpy(startstr, custom_file, FILE_MAX);
 
-			if (cfgdir) {
-				BLI_make_file_string(G.main->name, prefstr, cfgdir, BLENDER_USERPREF_FILE);
-			}
-			else {
-				prefstr[0] = '\0';
-			}
+	/* put aside screens to match with persistent windows later */
+	wm_window_match_init(C, &wmbase);
+
+	filepath_startup[0] = '\0';
+	filepath_userdef[0] = '\0';
+	app_template_system[0] = '\0';
+	app_template_config[0] = '\0';
+
+	const char * const cfgdir = BKE_appdir_folder_id(BLENDER_USER_CONFIG, NULL);
+	if (!use_factory_settings) {
+		if (cfgdir) {
+			BLI_path_join(filepath_startup, sizeof(filepath_startup), cfgdir, BLENDER_STARTUP_FILE, NULL);
+			BLI_path_join(filepath_userdef, sizeof(filepath_userdef), cfgdir, BLENDER_USERPREF_FILE, NULL);
 		}
-		else if (cfgdir) {
-			BLI_make_file_string(G.main->name, startstr, cfgdir, BLENDER_STARTUP_FILE);
-			BLI_make_file_string(G.main->name, prefstr, cfgdir, BLENDER_USERPREF_FILE);
+		else {
+			use_factory_settings = true;  /* no user config directory was found */
+		}
+
+		if (filepath_startup_override) {
+			BLI_strncpy(filepath_startup, filepath_startup_override, FILE_MAX);
+		}
+	}
+
+	/* load preferences before startup.blend */
+	if (!use_factory_settings && BLI_exists(filepath_userdef)) {
+		UserDef *userdef = BKE_blendfile_userdef_read(filepath_userdef, NULL);
+		if (userdef != NULL) {
+			BKE_blender_userdef_set_data(userdef);
+			MEM_freeN(userdef);
+
+			read_userdef_from_memory = false;
+			skip_flags |= BLO_READ_SKIP_USERDEF;
+			printf("Read prefs: %s\n", filepath_userdef);
+		}
+	}
+
+	const char *app_template = NULL;
+
+	if (filepath_startup_override != NULL) {
+		/* pass */
+	}
+	else if (app_template_override) {
+		app_template = app_template_override;
+	}
+	else if (!use_factory_settings && U.app_template[0]) {
+		app_template = U.app_template;
+	}
+
+	if (app_template != NULL) {
+		BKE_appdir_app_template_id_search(app_template, app_template_system, sizeof(app_template_system));
+	}
+
+	/* insert template name into startup file ('cfgdir' is non-NULL whenever factory settings are off) */
+	if (app_template != NULL) {
+		/* note that the path is being set even when 'use_factory_settings == true'
+		 * this is done so we can load a templates factory-settings */
+		if (!use_factory_settings) {
+			BLI_path_join(app_template_config, sizeof(app_template_config), cfgdir, app_template, NULL);
+			BLI_path_join(filepath_startup, sizeof(filepath_startup), app_template_config, BLENDER_STARTUP_FILE, NULL);
+			if (BLI_access(filepath_startup, R_OK) != 0) {
+				filepath_startup[0] = '\0';
+			}
 		}
 		else {
-			startstr[0] = '\0';
-			prefstr[0] = '\0';
-			from_memory = 1;
+			filepath_startup[0] = '\0';
+		}
+
+		if (filepath_startup[0] == '\0') {
+			BLI_path_join(filepath_startup, sizeof(filepath_startup), app_template_system, BLENDER_STARTUP_FILE, NULL);
 		}
 	}
-	
-	/* put aside screens to match with persistent windows later */
-	wm_window_match_init(C, &wmbase);
-	
-	if (!from_memory) {
-		if (BLI_access(startstr, R_OK) == 0) {
-			success = (BKE_blendfile_read(C, startstr, NULL) != BKE_BLENDFILE_READ_FAIL);
+
+	if (!use_factory_settings || (filepath_startup[0] != '\0')) {
+		if (BLI_access(filepath_startup, R_OK) == 0) {
+			success = (BKE_blendfile_read(C, filepath_startup, NULL, skip_flags) != BKE_BLENDFILE_READ_FAIL);
 		}
 		if (BLI_listbase_is_empty(&U.themes)) {
 			if (G.debug & G_DEBUG)
-				printf("\nNote: No (valid) '%s' found, fall back to built-in default.\n\n", startstr);
-			success = 0;
+				printf("\nNote: No (valid) '%s' found, fall back to built-in default.\n\n", filepath_startup);
+			success = false;
 		}
 	}
 
-	if (success == 0 && custom_file && reports) {
-		BKE_reportf(reports, RPT_ERROR, "Could not read '%s'", custom_file);
-		/*We can not return from here because wm is already reset*/
+	if (success == false && filepath_startup_override && reports) {
+		/* We can not return from here because wm is already reset */
+		BKE_reportf(reports, RPT_ERROR, "Could not read '%s'", filepath_startup_override);
 	}
 
-	if (success == 0) {
-		success = BKE_blendfile_read_from_memory(C, datatoc_startup_blend, datatoc_startup_blend_size, NULL, true);
+	if (success == false) {
+		success = BKE_blendfile_read_from_memory(
+		        C, datatoc_startup_blend, datatoc_startup_blend_size,
+		        NULL, skip_flags, true);
 		if (BLI_listbase_is_empty(&wmbase)) {
 			wm_clear_default_size(C);
 		}
-		BKE_tempdir_init(U.tempdir);
+	}
 
-#ifdef WITH_PYTHON_SECURITY
-		/* use alternative setting for security nuts
-		 * otherwise we'd need to patch the binary blob - startup.blend.c */
-		U.flag |= USER_SCRIPT_AUTOEXEC_DISABLE;
-#endif
+	if (use_empty_data) {
+		BKE_blendfile_read_make_empty(C);
 	}
-	
-	/* check new prefs only after startup.blend was finished */
-	if (!from_memory && BLI_exists(prefstr)) {
-		int done = BKE_blendfile_read_userdef(prefstr, NULL);
-		if (done != BKE_BLENDFILE_READ_FAIL) {
-			read_userdef_from_memory = false;
-			printf("Read new prefs: %s\n", prefstr);
+
+	/* Load template preferences,
+	 * unlike regular preferences we only use some of the settings,
+	 * see: BKE_blender_userdef_set_app_template */
+	if (app_template_system[0] != '\0') {
+		char temp_path[FILE_MAX];
+		temp_path[0] = '\0';
+		if (!use_factory_settings) {
+			BLI_path_join(temp_path, sizeof(temp_path), app_template_config, BLENDER_USERPREF_FILE, NULL);
+			if (BLI_access(temp_path, R_OK) != 0) {
+				temp_path[0] = '\0';
+			}
+		}
+
+		if (temp_path[0] == '\0') {
+			BLI_path_join(temp_path, sizeof(temp_path), app_template_system, BLENDER_USERPREF_FILE, NULL);
+		}
+
+		UserDef *userdef_template = NULL;
+		/* just avoids missing file warning */
+		if (BLI_exists(temp_path)) {
+			userdef_template = BKE_blendfile_userdef_read(temp_path, NULL);
+		}
+		if (userdef_template == NULL) {
+			/* we need to have preferences load to overwrite preferences from previous template */
+			userdef_template = BKE_blendfile_userdef_read_from_memory(
+			        datatoc_startup_blend, datatoc_startup_blend_size, NULL);
+		}
+		if (userdef_template) {
+			BKE_blender_userdef_set_app_template(userdef_template);
+			BKE_blender_userdef_free_data(userdef_template);
+			MEM_freeN(userdef_template);
 		}
 	}
-	
+
+	if (app_template_override) {
+		BLI_strncpy(U.app_template, app_template_override, sizeof(U.app_template));
+	}
+
 	/* prevent buggy files that had G_FILE_RELATIVE_REMAP written out by mistake. Screws up autosaves otherwise
 	 * can remove this eventually, only in a 2.53 and older, now its not written */
 	G.fileflags &= ~G_FILE_RELATIVE_REMAP;
@@ -744,11 +836,14 @@ int wm_homefile_read(bContext *C, ReportList *reports, bool from_memory, const c
 	G.main->name[0] = '\0';
 
 	/* When loading factory settings, the reset solid OpenGL lights need to be applied. */
-	if (!G.background) GPU_default_lights();
-	
-	/* XXX */
-	G.save_over = 0;    // start with save preference untitled.blend
-	G.fileflags &= ~G_FILE_AUTOPLAY;    /*  disable autoplay in startup.blend... */
+	if (!G.background) {
+		GPU_default_lights();
+	}
+
+	/* start with save preference untitled.blend */
+	G.save_over = 0;
+	/* disable auto-play in startup.blend... */
+	G.fileflags &= ~G_FILE_AUTOPLAY;
 
 	wm_file_read_post(C, true);
 
@@ -1264,6 +1359,13 @@ static int wm_homefile_write_exec(bContext *C, wmOperator *op)
 	char filepath[FILE_MAX];
 	int fileflags;
 
+	const char *app_template = U.app_template[0] ? U.app_template : NULL;
+	const char * const cfgdir = BKE_appdir_folder_id_create(BLENDER_USER_CONFIG, app_template);
+	if (cfgdir == NULL) {
+		BKE_report(op->reports, RPT_ERROR, "Unable to create user config path");
+		return OPERATOR_CANCELLED;
+	}
+
 	BLI_callback_exec(G.main, NULL, BLI_CB_EVT_SAVE_PRE);
 
 	/* check current window and close it if temp */
@@ -1273,7 +1375,8 @@ static int wm_homefile_write_exec(bContext *C, wmOperator *op)
 	/* update keymaps in user preferences */
 	WM_keyconfig_update(wm);
 
-	BLI_make_file_string("/", filepath, BKE_appdir_folder_id_create(BLENDER_USER_CONFIG, NULL), BLENDER_STARTUP_FILE);
+	BLI_path_join(filepath, sizeof(filepath), cfgdir, BLENDER_STARTUP_FILE, NULL);
+
 	printf("trying to save homefile at %s ", filepath);
 
 	ED_editors_flush_edits(C, false);
@@ -1351,21 +1454,44 @@ static int wm_userpref_write_exec(bContext *C, wmOperator *op)
 {
 	wmWindowManager *wm = CTX_wm_manager(C);
 	char filepath[FILE_MAX];
+	const char *cfgdir;  /* config directory of the prefs file currently being written */
+	bool ok = false;  /* set once the main prefs are written, cleared again if the app-template write fails */
 
 	/* update keymaps in user preferences */
 	WM_keyconfig_update(wm);
 
-	BLI_make_file_string("/", filepath, BKE_appdir_folder_id_create(BLENDER_USER_CONFIG, NULL), BLENDER_USERPREF_FILE);
-	printf("trying to save userpref at %s ", filepath);
-
-	if (BKE_blendfile_write_userdef(filepath, op->reports) == 0) {
-		printf("fail\n");
-		return OPERATOR_CANCELLED;
+	if ((cfgdir = BKE_appdir_folder_id_create(BLENDER_USER_CONFIG, NULL))) {
+		BLI_path_join(filepath, sizeof(filepath), cfgdir, BLENDER_USERPREF_FILE, NULL);
+		printf("trying to save userpref at %s ", filepath);
+		if (BKE_blendfile_userdef_write(filepath, op->reports) != 0) {
+			printf("ok\n");
+			ok = true;
+		}
+		else {
+			printf("fail\n");
+		}
+	}
+	else {
+		BKE_report(op->reports, RPT_ERROR, "Unable to create userpref path");
 	}
 
-	printf("ok\n");
+	if (U.app_template[0] && (cfgdir = BKE_appdir_folder_id_create(BLENDER_USER_CONFIG, U.app_template))) {
+		/* Also save app-template prefs, a failed write here must not be reported as success. */
+		BLI_path_join(filepath, sizeof(filepath), cfgdir, BLENDER_USERPREF_FILE, NULL);
+		printf("trying to save app-template userpref at %s ", filepath);
+		if (BKE_blendfile_userdef_write(filepath, op->reports) == 0) {
+			printf("fail\n");
+			ok = false;
+		}
+		else {
+			printf("ok\n");
+		}
+	}
+	else if (U.app_template[0]) {
+		BKE_report(op->reports, RPT_ERROR, "Unable to create app-template userpref path");
+	}
 
-	return OPERATOR_FINISHED;
+	return ok ? OPERATOR_FINISHED : OPERATOR_CANCELLED;
 }
 
 void WM_OT_save_userpref(wmOperatorType *ot)
@@ -1400,11 +1526,11 @@ void WM_OT_read_history(wmOperatorType *ot)
 
 static int wm_homefile_read_exec(bContext *C, wmOperator *op)
 {
-	const bool from_memory = (STREQ(op->type->idname, "WM_OT_read_factory_settings"));
+	const bool use_factory_settings = (STREQ(op->type->idname, "WM_OT_read_factory_settings"));
 	char filepath_buf[FILE_MAX];
 	const char *filepath = NULL;
 
-	if (!from_memory) {
+	if (!use_factory_settings) {
 		PropertyRNA *prop = RNA_struct_find_property(op->ptr, "filepath");
 
 		/* This can be used when loading of a start-up file should only change
@@ -1426,7 +1552,34 @@ static int wm_homefile_read_exec(bContext *C, wmOperator *op)
 		G.fileflags &= ~G_FILE_NO_UI;
 	}
 
-	return wm_homefile_read(C, op->reports, from_memory, filepath) ? OPERATOR_FINISHED : OPERATOR_CANCELLED;
+	char app_template_buf[sizeof(U.app_template)];
+	const char *app_template;
+	PropertyRNA *prop_app_template = RNA_struct_find_property(op->ptr, "app_template");
+	const bool use_splash = !use_factory_settings && RNA_boolean_get(op->ptr, "use_splash");
+	const bool use_empty_data = RNA_boolean_get(op->ptr, "use_empty");
+
+	if (prop_app_template && RNA_property_is_set(op->ptr, prop_app_template)) {
+		RNA_property_string_get(op->ptr, prop_app_template, app_template_buf);
+		app_template = app_template_buf;
+	}
+	else if (!use_factory_settings) {
+		/* TODO: don't reset prefs on 'New File' */
+		BLI_strncpy(app_template_buf, U.app_template, sizeof(app_template_buf));
+		app_template = app_template_buf;
+	}
+	else {
+		app_template = NULL;
+	}
+
+	if (wm_homefile_read(C, op->reports, use_factory_settings, use_empty_data, filepath, app_template)) {
+		if (use_splash) {
+			WM_init_splash(C);
+		}
+		return OPERATOR_FINISHED;
+	}
+	else {
+		return OPERATOR_CANCELLED;
+	}
 }
 
 void WM_OT_read_homefile(wmOperatorType *ot)
@@ -1449,17 +1602,36 @@ void WM_OT_read_homefile(wmOperatorType *ot)
 	                       "Load user interface setup from the .blend file");
 	RNA_def_property_flag(prop, PROP_HIDDEN | PROP_SKIP_SAVE);
 
+	prop = RNA_def_boolean(ot->srna, "use_empty", false, "Empty", "");
+	RNA_def_property_flag(prop, PROP_HIDDEN | PROP_SKIP_SAVE);
+
+	/* So the splash can be kept open after loading a file (for templates). */
+	prop = RNA_def_boolean(ot->srna, "use_splash", false, "Splash", "");
+	RNA_def_property_flag(prop, PROP_HIDDEN | PROP_SKIP_SAVE);
+
+	prop = RNA_def_string(ot->srna, "app_template", "Template", sizeof(U.app_template), "", "");
+	RNA_def_property_flag(prop, PROP_HIDDEN | PROP_SKIP_SAVE);
+
 	/* omit poll to run in background mode */
 }
 
 void WM_OT_read_factory_settings(wmOperatorType *ot)
 {
+	PropertyRNA *prop;
+
 	ot->name = "Load Factory Settings";
 	ot->idname = "WM_OT_read_factory_settings";
 	ot->description = "Load default file and user preferences";
 
 	ot->invoke = WM_operator_confirm;
 	ot->exec = wm_homefile_read_exec;
+
+	prop = RNA_def_string(ot->srna, "app_template", "Template", sizeof(U.app_template), "", "");
+	RNA_def_property_flag(prop, PROP_HIDDEN | PROP_SKIP_SAVE);  /* "app_template" is read by wm_homefile_read_exec */
+
+	prop = RNA_def_boolean(ot->srna, "use_empty", false, "Empty", "");
+	RNA_def_property_flag(prop, PROP_HIDDEN | PROP_SKIP_SAVE);  /* "use_empty" is read by wm_homefile_read_exec */
+
 	/* omit poll to run in background mode */
 }
 
diff --git a/source/blender/windowmanager/intern/wm_init_exit.c b/source/blender/windowmanager/intern/wm_init_exit.c
index c11c398c616..9bafe72d805 100644
--- a/source/blender/windowmanager/intern/wm_init_exit.c
+++ b/source/blender/windowmanager/intern/wm_init_exit.c
@@ -192,7 +192,7 @@ void WM_init(bContext *C, int argc, const char **argv)
 	wm_init_reports(C);
 
 	/* get the default database, plus a wm */
-	wm_homefile_read(C, NULL, G.factory_startup, NULL);
+	wm_homefile_read(C, NULL, G.factory_startup, false, NULL, NULL);
 	
 
 	BLT_lang_set(NULL);
@@ -444,8 +444,6 @@ void WM_exit_ext(bContext *C, const bool do_python)
 {
 	wmWindowManager *wm = C ? CTX_wm_manager(C) : NULL;
 
-	BKE_sound_exit();
-
 	/* first wrap up running stuff, we assume only the active WM is running */
 	/* modal handlers are on window level freed, others too? */
 	/* note; same code copied in wm_files.c */
@@ -574,7 +572,7 @@ void WM_exit_ext(bContext *C, const bool do_python)
 	ED_file_exit(); /* for fsmenu */
 
 	UI_exit();
-	BKE_blender_userdef_free();
+	BKE_blender_userdef_free_data(&U);
 
 	RNA_exit(); /* should be after BPY_python_end so struct python slots are cleared */
 	
@@ -591,6 +589,10 @@ void WM_exit_ext(bContext *C, const bool do_python)
 
 	BLI_threadapi_exit();
 
+	/* No need to call this early, rather do it late so that other pieces of Blender using sound may exit cleanly,
+	 * see also T50676. */
+	BKE_sound_exit();
+
 	BKE_blender_atexit();
 
 	if (MEM_get_memory_blocks_in_use() != 0) {
diff --git a/source/blender/windowmanager/intern/wm_keymap.c b/source/blender/windowmanager/intern/wm_keymap.c
index e201fa433d4..db5fc23146f 100644
--- a/source/blender/windowmanager/intern/wm_keymap.c
+++ b/source/blender/windowmanager/intern/wm_keymap.c
@@ -121,6 +121,13 @@ static void wm_keymap_item_properties_update_ot(wmKeyMapItem *kmi)
 			if (ot->srna != kmi->ptr->type) {
 				/* matches wm_keymap_item_properties_set but doesnt alloc new ptr */
 				WM_operator_properties_create_ptr(kmi->ptr, ot);
+				/* 'kmi->ptr->data' NULL'd above, keep using existing properties.
+				 * Note: the operators property types may have changed,
+				 * we will need a more comprehensive sanitize function to support this properly.
+				 */
+				if (kmi->properties) {
+					kmi->ptr->data = kmi->properties;
+				}
 				WM_operator_properties_sanitize(kmi->ptr, 1);
 			}
 		}
diff --git a/source/blender/windowmanager/intern/wm_operators.c b/source/blender/windowmanager/intern/wm_operators.c
index cef5e7e4a8e..d827ccafda8 100644
--- a/source/blender/windowmanager/intern/wm_operators.c
+++ b/source/blender/windowmanager/intern/wm_operators.c
@@ -900,7 +900,7 @@ void WM_operator_properties_create_ptr(PointerRNA *ptr, wmOperatorType *ot)
 
 void WM_operator_properties_create(PointerRNA *ptr, const char *opstring)
 {
-	wmOperatorType *ot = WM_operatortype_find(opstring, 0);
+	wmOperatorType *ot = WM_operatortype_find(opstring, false);
 
 	if (ot)
 		WM_operator_properties_create_ptr(ptr, ot);
@@ -1118,6 +1118,7 @@ static uiBlock *wm_enum_search_menu(bContext *C, ARegion *ar, void *arg_op)
 	block = UI_block_begin(C, ar, "_popup", UI_EMBOSS);
 	UI_block_flag_enable(block, UI_BLOCK_LOOP | UI_BLOCK_MOVEMOUSE_QUIT | UI_BLOCK_SEARCH_MENU);
 
+	search[0] = '\0';
 #if 0 /* ok, this isn't so easy... */
 	uiDefBut(block, UI_BTYPE_LABEL, 0, RNA_struct_ui_name(op->type->srna), 10, 10, UI_searchbox_size_x(), UI_UNIT_Y, NULL, 0.0, 0.0, 0, 0, "");
 #endif
@@ -1402,20 +1403,6 @@ static void dialog_exec_cb(bContext *C, void *arg1, void *arg2)
 	}
 }
 
-static void popup_check_cb(bContext *C, void *op_ptr, void *UNUSED(arg))
-{
-	wmOperator *op = op_ptr;
-	if (op->type->check) {
-		if (op->type->check(C, op)) {
-			/* check for popup and re-layout buttons */
-			ARegion *ar_menu = CTX_wm_menu(C);
-			if (ar_menu) {
-				ED_region_tag_refresh_ui(ar_menu);
-			}
-		}
-	}
-}
-
 /* Dialogs are popups that require user verification (click OK) before exec */
 static uiBlock *wm_block_dialog_create(bContext *C, ARegion *ar, void *userData)
 {
@@ -1434,8 +1421,6 @@ static uiBlock *wm_block_dialog_create(bContext *C, ARegion *ar, void *userData)
 
 	layout = UI_block_layout(block, UI_LAYOUT_VERTICAL, UI_LAYOUT_PANEL, 0, 0, data->width, data->height, 0, style);
 	
-	UI_block_func_set(block, popup_check_cb, op, NULL);
-
 	uiLayoutOperatorButs(C, layout, op, NULL, 'H', UI_LAYOUT_OP_SHOW_TITLE);
 	
 	/* clear so the OK button is left alone */
@@ -1474,8 +1459,6 @@ static uiBlock *wm_operator_ui_create(bContext *C, ARegion *ar, void *userData)
 
 	layout = UI_block_layout(block, UI_LAYOUT_VERTICAL, UI_LAYOUT_PANEL, 0, 0, data->width, data->height, 0, style);
 
-	UI_block_func_set(block, popup_check_cb, op, NULL);
-
 	/* since ui is defined the auto-layout args are not used */
 	uiLayoutOperatorButs(C, layout, op, NULL, 'V', 0);
 
@@ -1522,7 +1505,7 @@ int WM_operator_ui_popup(bContext *C, wmOperator *op, int width, int height)
 	data->width = width;
 	data->height = height;
 	data->free_op = true; /* if this runs and gets registered we may want not to free it */
-	UI_popup_block_ex(C, wm_operator_ui_create, NULL, wm_operator_ui_popup_cancel, data);
+	UI_popup_block_ex(C, wm_operator_ui_create, NULL, wm_operator_ui_popup_cancel, data, op);
 	return OPERATOR_RUNNING_MODAL;
 }
 
@@ -1552,7 +1535,7 @@ static int wm_operator_props_popup_ex(bContext *C, wmOperator *op,
 	if (!do_redo || !(U.uiflag & USER_GLOBALUNDO))
 		return WM_operator_props_dialog_popup(C, op, 15 * UI_UNIT_X, UI_UNIT_Y);
 
-	UI_popup_block_ex(C, wm_block_create_redo, NULL, wm_block_redo_cancel_cb, op);
+	UI_popup_block_ex(C, wm_block_create_redo, NULL, wm_block_redo_cancel_cb, op, op);
 
 	if (do_call)
 		wm_block_redo_cb(C, op, 0);
@@ -1594,7 +1577,7 @@ int WM_operator_props_dialog_popup(bContext *C, wmOperator *op, int width, int h
 	data->free_op = true; /* if this runs and gets registered we may want not to free it */
 
 	/* op is not executed until popup OK but is clicked */
-	UI_popup_block_ex(C, wm_block_dialog_create, wm_operator_ui_popup_ok, wm_operator_ui_popup_cancel, data);
+	UI_popup_block_ex(C, wm_block_dialog_create, wm_operator_ui_popup_ok, wm_operator_ui_popup_cancel, data, op);
 
 	return OPERATOR_RUNNING_MODAL;
 }
@@ -1761,6 +1744,36 @@ static uiBlock *wm_block_create_splash(bContext *C, ARegion *ar, void *UNUSED(ar
 		ibuf = IMB_ibImageFromMemory((unsigned char *)datatoc_splash_png,
 		                             datatoc_splash_png_size, IB_rect, NULL, "<splash screen>");
 	}
+
+	/* overwrite splash with template image */
+	if (U.app_template[0] != '\0') {
+		ImBuf *ibuf_template = NULL;
+		char splash_filepath[FILE_MAX];
+		char template_directory[FILE_MAX];
+
+		if (BKE_appdir_app_template_id_search(
+		        U.app_template,
+		        template_directory, sizeof(template_directory)))
+		{
+			BLI_join_dirfile(
+			        splash_filepath, sizeof(splash_filepath), template_directory,
+			        (U.pixelsize == 2) ? "splash_2x.png" : "splash.png");
+			ibuf_template = IMB_loadiffname(splash_filepath, IB_rect, NULL);
+			if (ibuf_template) {
+				const int x_expect = ibuf->x;
+				const int y_expect = 230 * (int)U.pixelsize;
+				/* don't cover the header text */
+				if (ibuf_template->x == x_expect && ibuf_template->y == y_expect) {
+					memcpy(ibuf->rect, ibuf_template->rect, ibuf_template->x * ibuf_template->y * sizeof(char[4]));
+				}
+				else {
+					printf("Splash expected %dx%d found %dx%d, ignoring: %s\n",
+					       x_expect, y_expect, ibuf_template->x, ibuf_template->y, splash_filepath);
+				}
+				IMB_freeImBuf(ibuf_template);
+			}
+		}
+	}
 #endif
 
 	block = UI_block_begin(C, ar, "_popup", UI_EMBOSS);
diff --git a/source/blender/windowmanager/intern/wm_stereo.c b/source/blender/windowmanager/intern/wm_stereo.c
index 46cee907991..66ebf18c9e1 100644
--- a/source/blender/windowmanager/intern/wm_stereo.c
+++ b/source/blender/windowmanager/intern/wm_stereo.c
@@ -345,6 +345,32 @@ bool WM_stereo3d_enabled(wmWindow *win, bool skip_stereo3d_check)
 	return true;
 }
 
+/**
+ * Remap \a r_mouse_xy in place for split-window stereo modes (side-by-side, top-bottom), so the
+ * drawn cursor and the handled mouse position match visually. Other modes leave it untouched. */
+void wm_stereo3d_mouse_offset_apply(wmWindow *win, int *r_mouse_xy)
+{
+	int axis, half_size;
+
+	if (WM_stereo3d_enabled(win, false) == false) {
+		return;
+	}
+	switch (win->stereo3d_format->display_mode) {
+		case S3D_DISPLAY_SIDEBYSIDE:
+			axis = 0;
+			half_size = win->sizex / 2;
+			break;
+		case S3D_DISPLAY_TOPBOTTOM:
+			axis = 1;
+			half_size = win->sizey / 2;
+			break;
+		default:
+			return;
+	}
+	/* a position past the half-way point wraps into the first half, the result is then doubled */
+	r_mouse_xy[axis] = 2 * ((r_mouse_xy[axis] > half_size) ? (r_mouse_xy[axis] - half_size) : r_mouse_xy[axis]);
+}
+
 /************************** Stereo 3D operator **********************************/
 typedef struct Stereo3dData {
 	Stereo3dFormat stereo3d_format;
diff --git a/source/blender/windowmanager/intern/wm_window.c b/source/blender/windowmanager/intern/wm_window.c
index 2d43c47679d..aaf77946412 100644
--- a/source/blender/windowmanager/intern/wm_window.c
+++ b/source/blender/windowmanager/intern/wm_window.c
@@ -77,6 +77,7 @@
 #include "GPU_extensions.h"
 #include "GPU_init_exit.h"
 #include "GPU_glew.h"
+#include "BLF_api.h"
 
 /* for assert */
 #ifndef NDEBUG
@@ -311,7 +312,7 @@ void wm_window_close(bContext *C, wmWindowManager *wm, wmWindow *win)
 	if (tmpwin == NULL)
 		do_exit = 1;
 	
-	if ((U.uiflag & USER_QUIT_PROMPT) && !wm->file_saved) {
+	if ((U.uiflag & USER_QUIT_PROMPT) && !wm->file_saved && !G.background) {
 		if (do_exit) {
 			if (!GHOST_confirmQuit(win->ghostwin))
 				return;
@@ -374,14 +375,39 @@ void wm_window_title(wmWindowManager *wm, wmWindow *win)
 	}
 }
 
-static float wm_window_get_virtual_pixelsize(void)
+static void wm_window_set_dpi(wmWindow *win)
 {
-	return ((U.virtual_pixel == VIRTUAL_PIXEL_NATIVE) ? 1.0f : 2.0f);
-}
+	int auto_dpi = GHOST_GetDPIHint(win->ghostwin);  /* DPI hint GHOST gives for this window */
 
-float wm_window_pixelsize(wmWindow *win)
-{
-	return (GHOST_GetNativePixelSize(win->ghostwin) * wm_window_get_virtual_pixelsize());
+	/* Lazily init UI scale size (only while it is still 0), preserving backwards compatibility by
+	 * computing UI scale from ratio of previous DPI and auto DPI */
+	if (U.ui_scale == 0) {
+		int virtual_pixel = (U.virtual_pixel == VIRTUAL_PIXEL_NATIVE) ? 1 : 2;
+
+		if (U.dpi == 0) {
+			U.ui_scale = virtual_pixel;
+		}
+		else {
+			U.ui_scale = (virtual_pixel * U.dpi * 96.0f) / (auto_dpi * 72.0f);
+		}
+
+		CLAMP(U.ui_scale, 0.25f, 4.0f);  /* keep the derived scale within [0.25, 4.0] */
+	}
+
+	/* Blender's UI drawing assumes DPI 72 as a good default following macOS
+	 * while Windows and Linux use DPI 96. GHOST assumes a default 96 so we
+	 * remap the DPI to Blender's convention. */
+	int dpi = auto_dpi * U.ui_scale * (72.0 / 96.0f);
+
+	/* Automatically set larger pixel size for high DPI (integer 'dpi / 54', never below 1). */
+	int pixelsize = MAX2(1, dpi / 54);
+
+	/* Set user preferences globals for drawing, and for forward compatibility. */
+	U.pixelsize = GHOST_GetNativePixelSize(win->ghostwin) * pixelsize;
+	U.dpi = dpi / pixelsize;
+	U.virtual_pixel = (pixelsize == 1) ? VIRTUAL_PIXEL_NATIVE : VIRTUAL_PIXEL_DOUBLE;
+
+	BKE_blender_userdef_refresh();
 }
 
 /* belongs to below */
@@ -456,10 +482,8 @@ static void wm_window_ghostwindow_add(wmWindowManager *wm, const char *title, wm
 			glClear(GL_COLOR_BUFFER_BIT);
 		}
 		
-		/* displays with larger native pixels, like Macbook. Used to scale dpi with */
 		/* needed here, because it's used before it reads userdef */
-		U.pixelsize = wm_window_pixelsize(win);
-		BKE_blender_userdef_refresh();
+		wm_window_set_dpi(win);
 		
 		wm_window_swap_buffers(win);
 		
@@ -626,7 +650,6 @@ wmWindow *WM_window_open_temp(bContext *C, const rcti *rect_init, int type)
 	Scene *scene = CTX_data_scene(C);
 	const char *title;
 	rcti rect = *rect_init;
-	const short px_virtual = (short)wm_window_get_virtual_pixelsize();
 
 	/* changes rect to fit within desktop */
 	wm_window_check_position(&rect);
@@ -644,9 +667,8 @@ wmWindow *WM_window_open_temp(bContext *C, const rcti *rect_init, int type)
 		win->posy = rect.ymin;
 	}
 
-	/* multiply with virtual pixelsize, ghost handles native one (e.g. for retina) */
-	win->sizex = BLI_rcti_size_x(&rect) * px_virtual;
-	win->sizey = BLI_rcti_size_y(&rect) * px_virtual;
+	win->sizex = BLI_rcti_size_x(&rect);
+	win->sizey = BLI_rcti_size_y(&rect);
 
 	if (win->ghostwin) {
 		wm_window_set_size(win, win->sizex, win->sizey);
@@ -835,8 +857,7 @@ void wm_window_make_drawable(wmWindowManager *wm, wmWindow *win)
 		GHOST_ActivateWindowDrawingContext(win->ghostwin);
 		
 		/* this can change per window */
-		U.pixelsize = wm_window_pixelsize(win);
-		BKE_blender_userdef_refresh();
+		wm_window_set_dpi(win);
 	}
 }
 
@@ -1035,6 +1056,8 @@ static int ghost_event_proc(GHOST_EventHandle evt, GHOST_TUserDataPtr C_void_ptr
 				if (type == GHOST_kEventWindowSize) {
 					WM_jobs_stop(wm, win->screen, NULL);
 				}
+
+				wm_window_set_dpi(win);
 				
 				/* win32: gives undefined window size when minimized */
 				if (state != GHOST_kWindowStateMinimized) {
@@ -1118,7 +1141,19 @@ static int ghost_event_proc(GHOST_EventHandle evt, GHOST_TUserDataPtr C_void_ptr
 				}
 				break;
 			}
-				
+
+			case GHOST_kEventWindowDPIHintChanged:
+			{
+				wm_window_set_dpi(win);
+				/* fonts are stored at each DPI level, without this we can easily load 100s of fonts */
+				BLF_cache_clear();
+
+				BKE_blender_userdef_refresh();
+				WM_main_add_notifier(NC_WINDOW, NULL);      /* full redraw */
+				WM_main_add_notifier(NC_SCREEN | NA_EDITED, NULL);    /* refresh region sizes */
+				break;
+			}
+
 			case GHOST_kEventOpenMainFile:
 			{
 				PointerRNA props_ptr;
@@ -1199,11 +1234,9 @@ static int ghost_event_proc(GHOST_EventHandle evt, GHOST_TUserDataPtr C_void_ptr
 			{
 				// only update if the actual pixel size changes
 				float prev_pixelsize = U.pixelsize;
-				U.pixelsize = wm_window_pixelsize(win);
+				wm_window_set_dpi(win);
 
 				if (U.pixelsize != prev_pixelsize) {
-					BKE_blender_userdef_refresh();
-
 					// close all popups since they are positioned with the pixel
 					// size baked in and it's difficult to correct them
 					wmWindow *oldWindow = CTX_wm_window(C);
diff --git a/source/blender/windowmanager/wm.h b/source/blender/windowmanager/wm.h
index 2f06ddab1e8..e8485359490 100644
--- a/source/blender/windowmanager/wm.h
+++ b/source/blender/windowmanager/wm.h
@@ -78,6 +78,7 @@ void wm_autosave_location(char *filepath);
 
 /* wm_stereo.c */
 void wm_method_draw_stereo3d(const bContext *C, wmWindow *win);
+void wm_stereo3d_mouse_offset_apply(wmWindow *win, int *r_mouse_xy);
 int wm_stereo3d_set_exec(bContext *C, wmOperator *op);
 int wm_stereo3d_set_invoke(bContext *C, wmOperator *op, const wmEvent *event);
 void wm_stereo3d_set_draw(bContext *C, wmOperator *op);
diff --git a/source/blender/windowmanager/wm_files.h b/source/blender/windowmanager/wm_files.h
index 396907a3f6d..9a1518e15b0 100644
--- a/source/blender/windowmanager/wm_files.h
+++ b/source/blender/windowmanager/wm_files.h
@@ -35,7 +35,10 @@ struct wmOperatorType;
 
 /* wm_files.c */
 void		wm_history_file_read(void);
-int			wm_homefile_read(struct bContext *C, struct ReportList *reports, bool from_memory, const char *filepath);
+int			wm_homefile_read(
+        struct bContext *C, struct ReportList *reports,
+        bool use_factory_settings, bool use_empty_data,
+        const char *filepath_startup_override, const char *app_template_override);
 void		wm_file_read_report(bContext *C);
 
 void        WM_OT_save_homefile(struct wmOperatorType *ot);
diff --git a/source/blender/windowmanager/wm_window.h b/source/blender/windowmanager/wm_window.h
index c106f9d7851..f70ec6b47f6 100644
--- a/source/blender/windowmanager/wm_window.h
+++ b/source/blender/windowmanager/wm_window.h
@@ -63,8 +63,6 @@ void		wm_window_swap_buffers	(wmWindow *win);
 void		wm_window_set_swap_interval(wmWindow *win, int interval);
 bool		wm_window_get_swap_interval(wmWindow *win, int *intervalOut);
 
-float		wm_window_pixelsize(wmWindow *win);
-
 void		wm_get_cursor_position			(wmWindow *win, int *x, int *y);
 void		wm_cursor_position_from_ghost	(wmWindow *win, int *x, int *y);
 void		wm_cursor_position_to_ghost		(wmWindow *win, int *x, int *y);
diff --git a/source/blenderplayer/bad_level_call_stubs/stubs.c b/source/blenderplayer/bad_level_call_stubs/stubs.c
index 08d00e9148a..d659f5c1a8a 100644
--- a/source/blenderplayer/bad_level_call_stubs/stubs.c
+++ b/source/blenderplayer/bad_level_call_stubs/stubs.c
@@ -306,6 +306,7 @@ int WM_operator_confirm(struct bContext *C, struct wmOperator *op, const struct
 struct MenuType *WM_menutype_find(const char *idname, bool quiet) RET_NULL
 void WM_operator_stack_clear(struct wmWindowManager *wm) RET_NONE
 void WM_operator_handlers_clear(wmWindowManager *wm, struct wmOperatorType *ot) RET_NONE
+bool WM_operator_is_repeat(const struct bContext *C, const struct wmOperator *op) RET_ZERO
 
 void WM_autosave_init(wmWindowManager *wm) RET_NONE
 void WM_jobs_kill_all_except(struct wmWindowManager *wm, void *owner) RET_NONE
@@ -736,7 +737,8 @@ int collada_export(struct Scene *sce,
                    int use_blender_profile,
                    int sort_by_name,
                    BC_export_transformation_type export_transformation_type,
-                   int open_sim) RET_ZERO
+                   int open_sim,
+                   int keep_bind_info) RET_ZERO
 
 void ED_mesh_calc_tessface(struct Mesh *mesh, bool free_mpoly) RET_NONE
 
diff --git a/source/creator/creator_args.c b/source/creator/creator_args.c
index 27579e58dba..658a0b2db08 100644
--- a/source/creator/creator_args.c
+++ b/source/creator/creator_args.c
@@ -946,7 +946,7 @@ static int arg_handle_native_pixels_set(int UNUSED(argc), const char **UNUSED(ar
 }
 
 static const char arg_handle_with_borders_doc[] =
-"\n\tForce opening without borders"
+"\n\tForce opening with borders"
 ;
 static int arg_handle_with_borders(int UNUSED(argc), const char **UNUSED(argv), void *UNUSED(data))
 {
@@ -1364,7 +1364,7 @@ static int arg_handle_render_frame(int argc, const char **argv, void *data)
 
 			re = RE_NewRender(scene->id.name);
 			BLI_begin_threaded_malloc();
-			BKE_reports_init(&reports, RPT_PRINT);
+			BKE_reports_init(&reports, RPT_STORE);
 
 			RE_SetReports(re, &reports);
 			for (int i = 0; i < frames_range_len; i++) {
@@ -1379,6 +1379,7 @@ static int arg_handle_render_frame(int argc, const char **argv, void *data)
 				}
 			}
 			RE_SetReports(re, NULL);
+			BKE_reports_clear(&reports);
 			BLI_end_threaded_malloc();
 			MEM_freeN(frame_range_arr);
 			return 1;
@@ -1406,10 +1407,11 @@ static int arg_handle_render_animation(int UNUSED(argc), const char **UNUSED(arg
 		Render *re = RE_NewRender(scene->id.name);
 		ReportList reports;
 		BLI_begin_threaded_malloc();
-		BKE_reports_init(&reports, RPT_PRINT);
+		BKE_reports_init(&reports, RPT_STORE);
 		RE_SetReports(re, &reports);
 		RE_BlenderAnim(re, bmain, scene, NULL, scene->lay, scene->r.sfra, scene->r.efra, scene->r.frame_step);
 		RE_SetReports(re, NULL);
+		BKE_reports_clear(&reports);
 		BLI_end_threaded_malloc();
 	}
 	else {
diff --git a/source/gameengine/BlenderRoutines/BL_KetsjiEmbedStart.cpp b/source/gameengine/BlenderRoutines/BL_KetsjiEmbedStart.cpp
index d7fe8d8ce59..6c3751ae34d 100644
--- a/source/gameengine/BlenderRoutines/BL_KetsjiEmbedStart.cpp
+++ b/source/gameengine/BlenderRoutines/BL_KetsjiEmbedStart.cpp
@@ -104,13 +104,13 @@ typedef void * wmUIHandlerRemoveFunc;
 #  include AUD_DEVICE_H
 #endif
 
-static BlendFileData *load_game_data(char *filename)
+static BlendFileData *load_game_data(const char *filename)
 {
 	ReportList reports;
 	BlendFileData *bfd;
 	
 	BKE_reports_init(&reports, RPT_STORE);
-	bfd= BLO_read_from_file(filename, &reports);
+	bfd= BLO_read_from_file(filename, &reports, BLO_READ_SKIP_USERDEF);
 
 	if (!bfd) {
 		printf("Loading %s failed: ", filename);
diff --git a/source/gameengine/GamePlayer/ghost/GPG_ghost.cpp b/source/gameengine/GamePlayer/ghost/GPG_ghost.cpp
index dd93a6ff424..0c206dfce3d 100644
--- a/source/gameengine/GamePlayer/ghost/GPG_ghost.cpp
+++ b/source/gameengine/GamePlayer/ghost/GPG_ghost.cpp
@@ -355,7 +355,7 @@ static BlendFileData *load_game_data(const char *progname, char *filename = NULL
 			BLI_strncpy(bfd->main->name, progname, sizeof(bfd->main->name));
 		}
 	} else {
-		bfd= BLO_read_from_file(progname, &reports);
+		bfd= BLO_read_from_file(progname, &reports, BLO_READ_SKIP_NONE);
 	}
 	
 	if (!bfd && filename) {
diff --git a/source/gameengine/VideoTexture/FilterColor.cpp b/source/gameengine/VideoTexture/FilterColor.cpp
index eed84a8580c..15a7e9e4cd1 100644
--- a/source/gameengine/VideoTexture/FilterColor.cpp
+++ b/source/gameengine/VideoTexture/FilterColor.cpp
@@ -68,7 +68,7 @@ PyTypeObject FilterGrayType =
 	0,                         /*tp_setattro*/
 	0,                         /*tp_as_buffer*/
 	Py_TPFLAGS_DEFAULT,        /*tp_flags*/
-	"Filter for gray scale effect",       /* tp_doc */
+	"Filter for grayscale effect",       /* tp_doc */
 	0,		               /* tp_traverse */
 	0,		               /* tp_clear */
 	0,		               /* tp_richcompare */
diff --git a/source/gameengine/VideoTexture/FilterColor.h b/source/gameengine/VideoTexture/FilterColor.h
index 350f7270874..d042863d7e8 100644
--- a/source/gameengine/VideoTexture/FilterColor.h
+++ b/source/gameengine/VideoTexture/FilterColor.h
@@ -36,7 +36,7 @@
 #include "FilterBase.h"
 
 
-/// pixel filter for gray scale
+/// pixel filter for grayscale
 class FilterGray : public FilterBase
 {
 public:
@@ -53,7 +53,7 @@ protected:
 		// calculate gray value
 		unsigned int gray = (28 * (VT_B(val)) + 151 * (VT_G(val))
 			+ 77 * (VT_R(val))) >> 8;
-		// return gray scale value
+		// return grayscale value
 		VT_R(val) = gray;
 		VT_G(val) = gray;
 		VT_B(val) = gray;
diff --git a/tests/gtests/blenlib/BLI_array_store_test.cc b/tests/gtests/blenlib/BLI_array_store_test.cc
index 5af6e639e64..370a4111bae 100644
--- a/tests/gtests/blenlib/BLI_array_store_test.cc
+++ b/tests/gtests/blenlib/BLI_array_store_test.cc
@@ -36,15 +36,15 @@ static void print_mem_saved(const char *id, const BArrayStore *bs)
 /* -------------------------------------------------------------------- */
 /* Test Chunks (building data from list of chunks) */
 
-typedef struct TestChunnk {
-	struct TestChunnk *next, *prev;
+typedef struct TestChunk {
+	struct TestChunk *next, *prev;
 	const void *data;
 	size_t data_len;
-} TestChunnk;
+} TestChunk;
 
-static TestChunnk *testchunk_list_add(ListBase *lb, const void *data, size_t data_len)
+static TestChunk *testchunk_list_add(ListBase *lb, const void *data, size_t data_len)
 {
-	TestChunnk *tc = (TestChunnk *)MEM_mallocN(sizeof(*tc), __func__);
+	TestChunk *tc = (TestChunk *)MEM_mallocN(sizeof(*tc), __func__);
 	tc->data = data;
 	tc->data_len = data_len;
 	BLI_addtail(lb, tc);
@@ -53,7 +53,7 @@ static TestChunnk *testchunk_list_add(ListBase *lb, const void *data, size_t dat
 }
 
 #if 0
-static TestChunnk *testchunk_list_add_copydata(ListBase *lb, const void *data, size_t data_len)
+static TestChunk *testchunk_list_add_copydata(ListBase *lb, const void *data, size_t data_len)
 {
 	void *data_copy = MEM_mallocN(data_len, __func__);
 	memcpy(data_copy, data, data_len);
@@ -63,7 +63,7 @@ static TestChunnk *testchunk_list_add_copydata(ListBase *lb, const void *data, s
 
 static void testchunk_list_free(ListBase *lb)
 {
-	for (TestChunnk *tc = (TestChunnk *)lb->first, *tb_next; tc; tc = tb_next) {
+	for (TestChunk *tc = (TestChunk *)lb->first, *tb_next; tc; tc = tb_next) {
 		tb_next = tc->next;
 		MEM_freeN((void *)tc->data);
 		MEM_freeN(tc);
@@ -77,12 +77,12 @@ static char *testchunk_as_data(
         size_t *r_data_len)
 {
 	size_t data_len = 0;
-	for (TestChunnk *tc = (TestChunnk *)lb->first; tc; tc = tc->next) {
+	for (TestChunk *tc = (TestChunk *)lb->first; tc; tc = tc->next) {
 		data_len += tc->data_len;
 	}
 	char *data = (char *)MEM_mallocN(data_len, __func__);
 	size_t i = 0;
-	for (TestChunnk *tc = (TestChunnk *)lb->first; tc; tc = tc->next) {
+	for (TestChunk *tc = (TestChunk *)lb->first; tc; tc = tc->next) {
 		memcpy(&data[i], tc->data, tc->data_len);
 		data_len += tc->data_len;
 		i += tc->data_len;
@@ -95,7 +95,7 @@ static char *testchunk_as_data(
 #endif
 
 static char *testchunk_as_data_array(
-        TestChunnk **tc_array, int tc_array_len,
+        TestChunk **tc_array, int tc_array_len,
         size_t *r_data_len)
 {
 	size_t data_len = 0;
@@ -105,7 +105,7 @@ static char *testchunk_as_data_array(
 	char *data = (char *)MEM_mallocN(data_len, __func__);
 	size_t i = 0;
 	for (int tc_index = 0; tc_index < tc_array_len; tc_index++) {
-		TestChunnk *tc = tc_array[tc_index];
+		TestChunk *tc = tc_array[tc_index];
 		memcpy(&data[i], tc->data, tc->data_len);
 		i += tc->data_len;
 	}
@@ -677,9 +677,9 @@ static void random_chunk_mutate_helper(
 	ListBase random_chunks;
 	BLI_listbase_clear(&random_chunks);
 	random_chunk_generate(&random_chunks, chunks_per_buffer, stride, chunk_count, random_seed);
-	TestChunnk **chunks_array = (TestChunnk **)MEM_mallocN(chunks_per_buffer * sizeof(TestChunnk *), __func__);
+	TestChunk **chunks_array = (TestChunk **)MEM_mallocN(chunks_per_buffer * sizeof(TestChunk *), __func__);
 	{
-		TestChunnk *tc = (TestChunnk *)random_chunks.first;
+		TestChunk *tc = (TestChunk *)random_chunks.first;
 		for (int i = 0; i < chunks_per_buffer; i++, tc = tc->next) {
 			chunks_array[i] = tc;
 		}
@@ -692,7 +692,7 @@ static void random_chunk_mutate_helper(
 	{
 		RNG *rng = BLI_rng_new(random_seed);
 		for (int i = 0; i < items_total; i++) {
-			BLI_rng_shuffle_array(rng, chunks_array, sizeof(TestChunnk *), chunks_per_buffer);
+			BLI_rng_shuffle_array(rng, chunks_array, sizeof(TestChunk *), chunks_per_buffer);
 			size_t data_len;
 			char *data = testchunk_as_data_array(chunks_array, chunks_per_buffer, &data_len);
 			BLI_assert(data_len == chunks_per_buffer * chunk_count * stride);
diff --git a/tests/gtests/blenlib/BLI_path_util_test.cc b/tests/gtests/blenlib/BLI_path_util_test.cc
index d017ab18b4d..ef469da50b2 100644
--- a/tests/gtests/blenlib/BLI_path_util_test.cc
+++ b/tests/gtests/blenlib/BLI_path_util_test.cc
@@ -5,6 +5,7 @@
 extern "C" {
 #include "BLI_fileops.h"
 #include "BLI_path_util.h"
+#include "BLI_string.h"
 #include "../../../source/blender/imbuf/IMB_imbuf.h"
 
 #ifdef _WIN32
@@ -57,7 +58,7 @@ char *zLhm65070058860608_br_find_exe(const char *default_exe)
 
 /* BLI_cleanup_path */
 #ifndef _WIN32
-TEST(path_util, PathUtilClean)
+TEST(path_util, Clean)
 {
 	/* "/./" -> "/" */
 	{
@@ -113,8 +114,238 @@ TEST(path_util, PathUtilClean)
 }
 #endif
 
+
+#define AT_INDEX(str_input, index_input, str_expect) \
+	{ \
+		char path[] = str_input; \
+		const char *expect = str_expect; \
+		int index_output, len_output; \
+		const bool ret = BLI_path_name_at_index(path, index_input, &index_output, &len_output); \
+		if (expect == NULL) { \
+			EXPECT_EQ(ret, false); \
+		} \
+		else { \
+			EXPECT_EQ(ret, true); \
+			EXPECT_EQ(strlen(expect), len_output); \
+			path[index_output + len_output] = '\0'; \
+			EXPECT_STREQ(&path[index_output], expect); \
+		} \
+	}((void)0)
+
+/* BLI_path_name_at_index */
+TEST(path_util, NameAtIndex_Single)
+{
+	AT_INDEX("/a", 0, "a");
+	AT_INDEX("/a/", 0, "a");
+	AT_INDEX("a/", 0, "a");
+	AT_INDEX("//a//", 0, "a");
+	AT_INDEX("a/b", 0, "a");
+
+	AT_INDEX("/a", 1, NULL);
+	AT_INDEX("/a/", 1, NULL);
+	AT_INDEX("a/", 1, NULL);
+	AT_INDEX("//a//", 1, NULL);
+}
+TEST(path_util, NameAtIndex_SingleNeg)
+{
+	AT_INDEX("/a", -1, "a");
+	AT_INDEX("/a/", -1, "a");
+	AT_INDEX("a/", -1, "a");
+	AT_INDEX("//a//", -1, "a");
+	AT_INDEX("a/b", -1, "b");
+
+	AT_INDEX("/a", -2, NULL);
+	AT_INDEX("/a/", -2, NULL);
+	AT_INDEX("a/", -2, NULL);
+	AT_INDEX("//a//", -2, NULL);
+}
+
+TEST(path_util, NameAtIndex_Double)
+{
+	AT_INDEX("/ab", 0, "ab");
+	AT_INDEX("/ab/", 0, "ab");
+	AT_INDEX("ab/", 0, "ab");
+	AT_INDEX("//ab//", 0, "ab");
+	AT_INDEX("ab/c", 0, "ab");
+
+	AT_INDEX("/ab", 1, NULL);
+	AT_INDEX("/ab/", 1, NULL);
+	AT_INDEX("ab/", 1, NULL);
+	AT_INDEX("//ab//", 1, NULL);
+}
+
+TEST(path_util, NameAtIndex_DoublNeg)
+{
+	AT_INDEX("/ab", -1, "ab");
+	AT_INDEX("/ab/", -1, "ab");
+	AT_INDEX("ab/", -1, "ab");
+	AT_INDEX("//ab//", -1, "ab");
+	AT_INDEX("ab/c", -1, "c");
+
+	AT_INDEX("/ab", -2, NULL);
+	AT_INDEX("/ab/", -2, NULL);
+	AT_INDEX("ab/", -2, NULL);
+	AT_INDEX("//ab//", -2, NULL);
+}
+
+TEST(path_util, NameAtIndex_Misc)
+{
+	AT_INDEX("/how/now/brown/cow", 0, "how");
+	AT_INDEX("/how/now/brown/cow", 1, "now");
+	AT_INDEX("/how/now/brown/cow", 2, "brown");
+	AT_INDEX("/how/now/brown/cow", 3, "cow");
+	AT_INDEX("/how/now/brown/cow", 4, NULL);
+	AT_INDEX("/how/now/brown/cow/", 4, NULL);
+}
+
+TEST(path_util, NameAtIndex_MiscNeg)
+{
+	AT_INDEX("/how/now/brown/cow", -4, "how");  /* negative indices count back from the last name */
+	AT_INDEX("/how/now/brown/cow", -3, "now");
+	AT_INDEX("/how/now/brown/cow", -2, "brown");
+	AT_INDEX("/how/now/brown/cow", -1, "cow");
+	AT_INDEX("/how/now/brown/cow", -5, NULL);  /* one step before the first name: no match */
+	AT_INDEX("/how/now/brown/cow/", -5, NULL);
+}
+
+TEST(path_util, NameAtIndex_MiscComplex)
+{
+	AT_INDEX("how//now/brown/cow", 0, "how");
+	AT_INDEX("//how///now\\/brown/cow", 1, "now");
+	AT_INDEX("/how/now\\//brown\\/cow", 2, "brown");
+	AT_INDEX("/how/now/brown/cow//\\", 3, "cow");
+	AT_INDEX("/how/now/brown/\\cow", 4, NULL);
+	AT_INDEX("how/now/brown/\\cow\\", 4, NULL);
+}
+
+TEST(path_util, NameAtIndex_MiscComplexNeg)
+{
+	AT_INDEX("how//now/brown/cow", -4, "how");
+	AT_INDEX("//how///now\\/brown/cow", -3, "now");
+	AT_INDEX("/how/now\\//brown\\/cow", -2, "brown");
+	AT_INDEX("/how/now/brown/cow//\\", -1, "cow");
+	AT_INDEX("/how/now/brown/\\cow", -5, NULL);
+	AT_INDEX("how/now/brown/\\cow\\", -5, NULL);
+}
+
+TEST(path_util, NameAtIndex_NoneComplex)
+{
+	AT_INDEX("", 0, NULL);
+	AT_INDEX("/", 0, NULL);
+	AT_INDEX("//", 0, NULL);
+	AT_INDEX("///", 0, NULL);
+}
+
+TEST(path_util, NameAtIndex_NoneComplexNeg)
+{
+	AT_INDEX("", -1, NULL);
+	AT_INDEX("/", -1, NULL);
+	AT_INDEX("//", -1, NULL);
+	AT_INDEX("///", -1, NULL);
+}
+
+#undef AT_INDEX
+
+#define JOIN(str_expect, out_size, ...) \
+	{ \
+		const char *expect = str_expect; \
+		char result[(out_size) + 1024]; \
+		/* check we don't write past the last byte */ \
+		result[out_size] = '\0'; \
+		BLI_path_join(result, out_size, __VA_ARGS__, NULL); \
+		/* simplify expected string */ \
+		BLI_str_replace_char(result, '\\', '/'); \
+		EXPECT_STREQ(result, expect); \
+		EXPECT_EQ(result[out_size], '\0'); \
+	} ((void)0)
+
+/* BLI_path_join */
+TEST(path_util, JoinNop)
+{
+	JOIN("", 100, "");
+	JOIN("", 100, "", "");
+	JOIN("", 100, "", "", "");
+	JOIN("/", 100, "/", "", "");
+	JOIN("/", 100, "/", "/");
+	JOIN("/", 100, "/", "", "/");
+	JOIN("/", 100, "/", "", "/", "");
+}
+
+TEST(path_util, JoinSingle)
+{
+	JOIN("test", 100, "test");
+	JOIN("", 100, "");
+	JOIN("a", 100, "a");
+	JOIN("/a", 100, "/a");
+	JOIN("a/", 100, "a/");
+	JOIN("/a/", 100, "/a/");
+	JOIN("/a/", 100, "/a//");
+	JOIN("//a/", 100, "//a//");
+}
+
+TEST(path_util, JoinTriple)
+{
+	JOIN("/a/b/c", 100, "/a", "b", "c");
+	JOIN("/a/b/c", 100, "/a/", "/b/", "/c");
+	JOIN("/a/b/c", 100, "/a/b/", "/c");
+	JOIN("/a/b/c", 100, "/a/b/c");
+	JOIN("/a/b/c", 100, "/", "a/b/c");
+
+	JOIN("/a/b/c/", 100, "/a/", "/b/", "/c/");
+	JOIN("/a/b/c/", 100, "/a/b/c/");
+	JOIN("/a/b/c/", 100, "/a/b/", "/c/");
+	JOIN("/a/b/c/", 100, "/a/b/c", "/");
+	JOIN("/a/b/c/", 100, "/", "a/b/c", "/");
+}
+
+TEST(path_util, JoinTruncateShort)
+{
+	JOIN("", 1, "/");
+	JOIN("/", 2, "/");
+	JOIN("a", 2, "", "aa");
+	JOIN("a", 2, "", "a/");
+	JOIN("a/b", 4, "a", "bc");
+	JOIN("ab/", 4, "ab", "c");
+	JOIN("/a/", 4, "/a", "b");
+	JOIN("/a/", 4, "/a/", "b/");
+	JOIN("/a/", 4, "/a", "/b/");
+	JOIN("/a/", 4, "/", "a/b/");
+	JOIN("//a", 4, "//", "a/b/");
+
+	JOIN("/a/b", 5, "/a", "b", "c");
+}
+
+TEST(path_util, JoinTruncateLong)
+{
+	JOIN("", 1, "//", "//longer", "path");
+	JOIN("/", 2, "//", "//longer", "path");
+	JOIN("//", 3, "//", "//longer", "path");
+	JOIN("//l", 4, "//", "//longer", "path");
+	/* snip */
+	JOIN("//longe", 8, "//", "//longer", "path");
+	JOIN("//longer", 9, "//", "//longer", "path");
+	JOIN("//longer/", 10, "//", "//longer", "path");
+	JOIN("//longer/p", 11, "//", "//longer", "path");
+	JOIN("//longer/pa", 12, "//", "//longer", "path");
+	JOIN("//longer/pat", 13, "//", "//longer", "path");
+	JOIN("//longer/path", 14, "//", "//longer", "path"); // not truncated
+	JOIN("//longer/path", 14, "//", "//longer", "path/");
+	JOIN("//longer/path/", 15, "//", "//longer", "path/"); // not truncated
+	JOIN("//longer/path/", 15, "//", "//longer", "path/", "trunc");
+	JOIN("//longer/path/t", 16, "//", "//longer", "path/", "trunc");
+}
+
+TEST(path_util, JoinComplex)
+{
+	JOIN("/a/b/c/d/e/f/g/", 100, "/", "\\a/b", "//////c/d", "", "e\\\\", "f", "g//");
+	JOIN("/aa/bb/cc/dd/ee/ff/gg/", 100, "/", "\\aa/bb", "//////cc/dd", "", "ee\\\\", "ff", "gg//");
+	JOIN("1/2/3/", 100, "1", "////////", "", "2", "3\\");
+}
+
+#undef JOIN
+
 /* BLI_path_frame */
-TEST(path_util, PathUtilFrame)
+TEST(path_util, Frame)
 {
 	bool ret;
 
@@ -177,7 +408,7 @@ TEST(path_util, PathUtilFrame)
 }
 
 /* BLI_split_dirfile */
-TEST(path_util, PathUtilSplitDirfile)
+TEST(path_util, SplitDirfile)
 {
 	{
 		const char *path = "";
diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt
index 4b5666e9b5a..f7ca9b02137 100644
--- a/tests/python/CMakeLists.txt
+++ b/tests/python/CMakeLists.txt
@@ -83,11 +83,14 @@ add_test(script_pyapi_bpy_utils_units ${TEST_BLENDER_EXE}
 	--python ${CMAKE_CURRENT_LIST_DIR}/bl_pyapi_bpy_utils_units.py
 )
 
-# test running mathutils testing script
 add_test(script_pyapi_mathutils ${TEST_BLENDER_EXE}
 	--python ${CMAKE_CURRENT_LIST_DIR}/bl_pyapi_mathutils.py
 )
 
+add_test(script_pyapi_idprop ${TEST_BLENDER_EXE}
+	--python ${CMAKE_CURRENT_LIST_DIR}/bl_pyapi_idprop.py
+)
+
 # ------------------------------------------------------------------------------
 # MODELING TESTS
 add_test(bevel ${TEST_BLENDER_EXE}
@@ -95,6 +98,11 @@ add_test(bevel ${TEST_BLENDER_EXE}
 	--python-text run_tests
 )
 
+add_test(split_faces ${TEST_BLENDER_EXE}
+	${TEST_SRC_DIR}/modeling/split_faces_test.blend
+	--python-text run_tests
+)
+
 # ------------------------------------------------------------------------------
 # IO TESTS
 
@@ -417,7 +425,12 @@ if(WITH_CYCLES)
 					-idiff "${OPENIMAGEIO_IDIFF}"
 				)
 			endif()
-		endmacro()	
+		endmacro()
+		if(WITH_OPENGL_TESTS)
+			add_cycles_render_test(opengl)
+		endif()
+		add_cycles_render_test(image)
+		add_cycles_render_test(mblur)
 		add_cycles_render_test(reports)
 		add_cycles_render_test(render)
 		add_cycles_render_test(shader)
diff --git a/tests/python/batch_import.py b/tests/python/batch_import.py
index 8fc679a7c15..bbe3a70327f 100644
--- a/tests/python/batch_import.py
+++ b/tests/python/batch_import.py
@@ -48,29 +48,17 @@ import os
 import sys
 
 
-def clear_scene():
-    import bpy
-    unique_obs = set()
-    for scene in bpy.data.scenes:
-        for obj in scene.objects[:]:
-            scene.objects.unlink(obj)
-            unique_obs.add(obj)
-
-    # remove obdata, for now only worry about the startup scene
-    for bpy_data_iter in (bpy.data.objects, bpy.data.meshes, bpy.data.lamps, bpy.data.cameras):
-        for id_data in bpy_data_iter:
-            bpy_data_iter.remove(id_data)
-
-
-def batch_import(operator="",
-                 path="",
-                 save_path="",
-                 match="",
-                 start=0,
-                 end=sys.maxsize,
-                 ):
+def batch_import(
+    operator="",
+    path="",
+    save_path="",
+    match="",
+    start=0,
+    end=sys.maxsize,
+):
     import addon_utils
     _reset_all = addon_utils.reset_all  # XXX, hack
+    _disable_all = addon_utils.disable_all  # XXX, hack
 
     import fnmatch
 
@@ -116,11 +104,12 @@ def batch_import(operator="",
 
         # hack so loading the new file doesn't undo our loaded addons
         addon_utils.reset_all = lambda: None  # XXX, hack
+        addon_utils.disable_all = lambda: None  # XXX, hack
 
-        bpy.ops.wm.read_factory_settings()
+        bpy.ops.wm.read_factory_settings(use_empty=True)
 
         addon_utils.reset_all = _reset_all  # XXX, hack
-        clear_scene()
+        addon_utils.disable_all = _disable_all  # XXX, hack
 
         result = op(filepath=f)
 
diff --git a/tests/python/bl_mesh_modifiers.py b/tests/python/bl_mesh_modifiers.py
index 526a54a49a2..bff2c31984c 100644
--- a/tests/python/bl_mesh_modifiers.py
+++ b/tests/python/bl_mesh_modifiers.py
@@ -31,7 +31,6 @@
 import math
 
 USE_QUICK_RENDER = False
-IS_BMESH = hasattr(__import__("bpy").types, "LoopColors")
 
 # -----------------------------------------------------------------------------
 # utility functions
@@ -203,13 +202,8 @@ def defaults_object(obj):
 
         mesh.show_normal_vertex = True
 
-        # lame!
-        if IS_BMESH:
-            for poly in mesh.polygons:
-                poly.use_smooth = True
-        else:
-            for face in mesh.faces:
-                face.use_smooth = True
+        for poly in mesh.polygons:
+            poly.use_smooth = True
 
 
 def defaults_modifier(mod):
@@ -220,16 +214,14 @@ def defaults_modifier(mod):
 # -----------------------------------------------------------------------------
 # models (utils)
 
+def mesh_bmesh_poly_elems(poly, elems):
+    vert_start = poly.loop_start
+    vert_total = poly.loop_total
+    return elems[vert_start:vert_start + vert_total]
 
-if IS_BMESH:
-    def mesh_bmesh_poly_elems(poly, elems):
-        vert_start = poly.loop_start
-        vert_total = poly.loop_total
-        return elems[vert_start:vert_start + vert_total]
-
-    def mesh_bmesh_poly_vertices(poly):
-        return [loop.vertex_index
-                for loop in mesh_bmesh_poly_elems(poly, poly.id_data.loops)]
+def mesh_bmesh_poly_vertices(poly):
+    return [loop.vertex_index
+            for loop in mesh_bmesh_poly_elems(poly, poly.id_data.loops)]
 
 
 def mesh_bounds(mesh):
@@ -258,21 +250,14 @@ def mesh_uv_add(obj):
 
     uv_lay = obj.data.uv_textures.new()
 
-    if IS_BMESH:
-        # XXX, odd that we need to do this. until UV's and texface
-        # are separated we will need to keep it
-        uv_loops = obj.data.uv_layers[-1]
-        uv_list = uv_loops.data[:]
-        for poly in obj.data.polygons:
-            poly_uvs = mesh_bmesh_poly_elems(poly, uv_list)
-            for i, c in enumerate(poly_uvs):
-                c.uv = uvs[i % 4]
-    else:
-        for uv in uv_lay.data:
-            uv.uv1 = uvs[0]
-            uv.uv2 = uvs[1]
-            uv.uv3 = uvs[2]
-            uv.uv4 = uvs[3]
+    # XXX, odd that we need to do this. until UV's and texface
+    # are separated we will need to keep it
+    uv_loops = obj.data.uv_layers[-1]
+    uv_list = uv_loops.data[:]
+    for poly in obj.data.polygons:
+        poly_uvs = mesh_bmesh_poly_elems(poly, uv_list)
+        for i, c in enumerate(poly_uvs):
+            c.uv = uvs[i % 4]
 
     return uv_lay
 
@@ -296,21 +281,12 @@ def mesh_vcol_add(obj, mode=0):
 
     mesh = obj.data
 
-    if IS_BMESH:
-        col_list = vcol_lay.data[:]
-        for poly in mesh.polygons:
-            face_verts = mesh_bmesh_poly_vertices(poly)
-            poly_cols = mesh_bmesh_poly_elems(poly, col_list)
-            for i, c in enumerate(poly_cols):
-                c.color = colors_get(face_verts[i])
-    else:
-        for i, col in enumerate(vcol_lay.data):
-            face_verts = mesh.faces[i].vertices
-            col.color1 = colors_get(face_verts[0])
-            col.color2 = colors_get(face_verts[1])
-            col.color3 = colors_get(face_verts[2])
-            if len(face_verts) == 4:
-                col.color4 = colors_get(face_verts[3])
+    col_list = vcol_lay.data[:]
+    for poly in mesh.polygons:
+        face_verts = mesh_bmesh_poly_vertices(poly)
+        poly_cols = mesh_bmesh_poly_elems(poly, col_list)
+        for i, c in enumerate(poly_cols):
+            c.color = colors_get(face_verts[i])
 
     return vcol_lay
 
@@ -470,10 +446,7 @@ def modifier_build_add(scene, obj):
     defaults_modifier(mod)
 
     # ensure we display some faces
-    if IS_BMESH:
-        totface = len(obj.data.polygons)
-    else:
-        totface = len(obj.data.faces)
+    totface = len(obj.data.polygons)
 
     mod.frame_start = totface // 2
     mod.frame_duration = totface
diff --git a/tests/python/bl_pyapi_idprop.py b/tests/python/bl_pyapi_idprop.py
new file mode 100644
index 00000000000..0a9cb044571
--- /dev/null
+++ b/tests/python/bl_pyapi_idprop.py
@@ -0,0 +1,144 @@
+# Apache License, Version 2.0
+
+# ./blender.bin --background -noaudio --python tests/python/bl_pyapi_idprop.py -- --verbose
+import bpy
+import unittest
+from array import array
+
+
+class TestHelper:
+
+    @property
+    def id(self):
+        return self._id
+
+    def setUp(self):
+        self._id = bpy.context.scene
+        assert(len(self._id.keys()) == 0)
+
+    def tearDown(self):
+        for key in list(self._id.keys()):
+            del self._id[key]
+
+    def assertAlmostEqualSeq(self, list1, list2):
+        self.assertEqual(len(list1), len(list2))
+        for v1, v2 in zip(list1, list2):
+            self.assertAlmostEqual(v1, v2, places=5)
+
+
+class TestIdPropertyCreation(TestHelper, unittest.TestCase):
+
+    def test_name_empty(self):
+        self.id[""] = 4
+        self.assertEqual(self.id[""], 4)
+
+    def test_name_too_long(self):
+        with self.assertRaises(KeyError):
+            self.id["name" * 30] = 4
+
+    def test_int(self):
+        self.id["a"] = 2
+        self.assertEqual(self.id["a"], 2)
+        self.assertTrue(isinstance(self.id["a"], int))
+
+        with self.assertRaises(OverflowError):
+            self.id["a"] = 2 ** 31  # integer <= 2 ** 31-1
+
+    def test_double(self):
+        self.id["a"] = 2.5
+        self.assertEqual(self.id["a"], 2.5)
+        self.assertTrue(isinstance(self.id["a"], float))
+
+    def test_unicode(self):
+        self.id["a"] = "Hello World"
+        self.assertEqual(self.id["a"], "Hello World")
+        self.assertTrue(isinstance(self.id["a"], str))
+
+    def test_bytes(self):
+        self.id["a"] = b"Hello World"
+        self.assertEqual(self.id["a"], b"Hello World")
+        self.assertTrue(isinstance(self.id["a"], bytes))
+
+    def test_sequence_double_list(self):
+        mylist = [1.2, 3.4, 5.6]
+        self.id["a"] = mylist
+        self.assertEqual(self.id["a"].to_list(), mylist)
+        self.assertEqual(self.id["a"].typecode, "d")
+
+    def test_sequence_int_list(self):
+        mylist = [1, 2, 3]
+        self.id["a"] = mylist
+        self.assertEqual(self.id["a"].to_list(), mylist)
+        self.assertEqual(self.id["a"].typecode, "i")
+
+    def test_sequence_float_array(self):
+        mylist = [1.2, 3.4, 5.6]
+        self.id["a"] = array("f", mylist)
+        self.assertAlmostEqualSeq(self.id["a"].to_list(), mylist)
+        self.assertEqual(self.id["a"].typecode, "d")
+
+    def test_sequence_double_array(self):
+        mylist = [1.2, 3.4, 5.6]
+        self.id["a"] = array("d", mylist)
+        self.assertAlmostEqualSeq(self.id["a"].to_list(), mylist)
+        self.assertEqual(self.id["a"].typecode, "d")
+
+    def test_sequence_int_array(self):
+        mylist = [1, 2, 3]
+        self.id["a"] = array("i", mylist)
+        self.assertAlmostEqualSeq(self.id["a"].to_list(), mylist)
+        self.assertEqual(self.id["a"].typecode, "i")
+
+    def test_sequence_other_array(self):
+        mylist = [1, 2, 3]
+        self.id["a"] = array("Q", mylist)
+        self.assertEqual(self.id["a"].to_list(), mylist)
+
+    def test_sequence_mixed_numerical_type(self):
+        self.id["a"] = [1, 2, 3.4, 5]
+        self.assertAlmostEqualSeq(self.id["a"].to_list(), [1.0, 2.0, 3.4, 5.0])
+        self.assertEqual(self.id["a"].typecode, "d")
+
+    def test_sequence_str_list(self):
+        # I'm a bit surprised that this works
+        mylist = ["abc", "qwe"]
+        self.id["a"] = mylist
+        self.assertEqual(self.id["a"], mylist)
+
+    def test_sequence_mixed_type(self):
+        with self.assertRaises(TypeError):
+            mylist = ["abc", 3, "qwe", 3.4]
+            self.id["a"] = mylist
+
+    def test_mapping_simple(self):
+        mydict = {"1": 10, "2": "20", "3": 30.5}
+        self.id["a"] = mydict
+        self.assertEqual(self.id["a"]["1"], mydict["1"])
+        self.assertEqual(self.id["a"]["2"], mydict["2"])
+        self.assertEqual(self.id["a"]["3"], mydict["3"])
+
+    def test_mapping_complex(self):
+        mydict = {
+            "1": [1, 2, 3],
+            "2": {"1": "abc", "2": array("i", [4, 5, 6])},
+            "3": {"1": {"1": 10}, "2": b"qwe"},
+        }
+        self.id["a"] = mydict
+        self.assertEqual(self.id["a"]["1"].to_list(), [1, 2, 3])
+        self.assertEqual(self.id["a"]["2"]["1"], "abc")
+        self.assertEqual(self.id["a"]["2"]["2"].to_list(), [4, 5, 6])
+        self.assertEqual(self.id["a"]["3"]["1"]["1"], 10)
+        self.assertEqual(self.id["a"]["3"]["2"], b"qwe")
+
+        with self.assertRaises(KeyError):
+            a = self.id["a"]["2"]["a"]
+
+    def test_invalid_type(self):
+        with self.assertRaises(TypeError):
+            self.id["a"] = self
+
+
+if __name__ == '__main__':
+    import sys
+    sys.argv = [__file__] + (sys.argv[sys.argv.index("--") + 1:] if "--" in sys.argv else [])
+    unittest.main()
diff --git a/tests/python/bl_run_operators.py b/tests/python/bl_run_operators.py
index 7e92b424faa..7d5f4127378 100644
--- a/tests/python/bl_run_operators.py
+++ b/tests/python/bl_run_operators.py
@@ -65,6 +65,7 @@ op_blacklist = (
     "wm.blenderplayer_start",
     "wm.recover_auto_save",
     "wm.quit_blender",
+    "wm.window_close",
     "wm.url_open",
     "wm.doc_view",
     "wm.doc_edit",
@@ -308,16 +309,7 @@ def run_ops(operators, setup_func=None, reset=True):
 
 # contexts
 def ctx_clear_scene():  # copied from batch_import.py
-    unique_obs = set()
-    for scene in bpy.data.scenes:
-        for obj in scene.objects[:]:
-            scene.objects.unlink(obj)
-            unique_obs.add(obj)
-
-    # remove obdata, for now only worry about the startup scene
-    for bpy_data_iter in (bpy.data.objects, bpy.data.meshes, bpy.data.lamps, bpy.data.cameras):
-        for id_data in bpy_data_iter:
-            bpy_data_iter.remove(id_data)
+    bpy.ops.wm.read_factory_settings(use_empty=True)
 
 
 def ctx_editmode_mesh():
diff --git a/tests/python/cycles_render_tests.py b/tests/python/cycles_render_tests.py
index 64a71da301a..a030cc5e0de 100755
--- a/tests/python/cycles_render_tests.py
+++ b/tests/python/cycles_render_tests.py
@@ -47,20 +47,41 @@ def printMessage(type, status, message):
 
 
 def render_file(filepath):
-    command = (
-        BLENDER,
-        "--background",
-        "-noaudio",
-        "--factory-startup",
-        "--enable-autoexec",
-        filepath,
-        "-E", "CYCLES",
-        # Run with OSL enabled
-        # "--python-expr", "import bpy; bpy.context.scene.cycles.shading_system = True",
-        "-o", TEMP_FILE_MASK,
-        "-F", "PNG",
-        "-f", "1",
+    dirname = os.path.dirname(filepath)
+    basedir = os.path.dirname(dirname)
+    subject = os.path.basename(dirname)
+    if subject == 'opengl':
+        command = (
+            BLENDER,
+            "--window-geometry", "0", "0", "1", "1",
+            "-noaudio",
+            "--factory-startup",
+            "--enable-autoexec",
+            filepath,
+            "-E", "CYCLES",
+            # Run with OSL enabled
+            # "--python-expr", "import bpy; bpy.context.scene.cycles.shading_system = True",
+            "-o", TEMP_FILE_MASK,
+            "-F", "PNG",
+            '--python', os.path.join(basedir,
+                                     "util",
+                                     "render_opengl.py")
         )
+    else:
+        command = (
+            BLENDER,
+            "--background",
+            "-noaudio",
+            "--factory-startup",
+            "--enable-autoexec",
+            filepath,
+            "-E", "CYCLES",
+            # Run with OSL enabled
+            # "--python-expr", "import bpy; bpy.context.scene.cycles.shading_system = True",
+            "-o", TEMP_FILE_MASK,
+            "-F", "PNG",
+            "-f", "1",
+            )
     try:
         output = subprocess.check_output(command)
         if VERBOSE:
author	Schoen <schoepas@deher1m1598.emea.adsint.biz>	2017-04-03 08:52:05 +0300
committer	Schoen <schoepas@deher1m1598.emea.adsint.biz>	2017-04-03 09:47:02 +0300
commit	90778901c9ea1e16d5907981e91bceba25ff207d (patch)
tree	00bf16ee403c3ab1b507a5ec90b098a22e050f7c
parent	76eebd9379a5dad519ff01cf215fbe3db6de931e (diff)
parent	3bf0026bec6f839ae0f7c40bde845299ce22e8de (diff)